Skip to content

Commit b5e4255

Browse files
address review comments
1 parent 70d4d4a commit b5e4255

2 files changed

Lines changed: 38 additions & 49 deletions

File tree

spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -298,21 +298,14 @@ private Result rebuildMetadata() {
298298
rewriteManifests(deltaSnapshots, endMetadata, rewriteManifestListResult.toRewrite());
299299

300300
// rebuild position delete files
301-
// Use DeleteFileSet to ensure proper equality comparison based on file location, content offset,
302-
// and content size. This is particularly important for deletion vectors (DV files) where
303-
// multiple DV entries can reference the same Puffin file but have different offsets and sizes.
304-
List<ContentFile<?>> allDeleteFiles =
301+
// Use DeleteFileSet to deduplicate based on file location, content offset, and content size.
302+
// This is important for deletion vectors (DVs) where multiple entries can reference the same
303+
// Puffin file but have different offsets and sizes.
304+
Set<DeleteFile> deleteFiles =
305305
rewriteManifestResult.toRewrite().stream()
306306
.filter(e -> e instanceof DeleteFile)
307-
.collect(Collectors.toList());
308-
Set<DeleteFile> deleteFiles =
309-
allDeleteFiles.stream()
310307
.map(e -> (DeleteFile) e)
311308
.collect(Collectors.toCollection(DeleteFileSet::create));
312-
LOG.debug(
313-
"Delete files before deduplication: {}, after deduplication with DeleteFileSet: {}",
314-
allDeleteFiles.size(),
315-
deleteFiles.size());
316309
rewritePositionDeletes(deleteFiles);
317310

318311
ImmutableRewriteTablePath.Result.Builder builder =

spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -568,26 +568,26 @@ public void testPositionDeletesAcrossFiles() throws Exception {
568568
/**
569569
* Test for https://github.com/apache/iceberg/issues/14814
570570
*
571-
* <p>This test verifies that rewrite_table_path correctly handles position delete files when
572-
* multiple snapshots exist. The fix uses DeleteFileSet to properly deduplicate delete files based
573-
* on file location, content offset, and content size, rather than relying on object identity.
571+
* <p>This test verifies that rewrite_table_path correctly deduplicates delete files when the same
572+
* delete file appears in multiple manifests. Without the DeleteFileSet fix, this test would fail
573+
* with AlreadyExistsException because DeleteFile objects don't override equals() and the same
574+
* file would be processed multiple times.
574575
*
575-
* <p>Note: The original bug occurred when the same delete file appeared in multiple manifest
576-
* entries (e.g., with ADDED and DELETED status), causing AlreadyExistsException. This test
577-
* exercises the code path with DeleteFileSet deduplication but may not fully reproduce the edge
578-
* case that occurs in production environments with complex manifest structures.
576+
* <p>The test creates a scenario where the same delete file is added to multiple snapshots,
577+
* causing it to appear in multiple manifest entries. When these manifests are processed, the same
578+
* delete file is returned as different object instances which need proper deduplication.
579579
*/
580580
@TestTemplate
581-
public void testPositionDeletesWithMultipleSnapshots() throws Exception {
581+
public void testPositionDeletesDeduplication() throws Exception {
582582
// Format versions 3 and 4 use Deletion Vectors stored in Puffin files, which have different
583-
// validation rules that prevent adding multiple position deletes for the same data file
583+
// validation rules that prevent adding the same delete file multiple times
584584
assumeThat(formatVersion)
585585
.as("Format versions 3+ use DVs with different validation rules")
586586
.isEqualTo(2);
587587

588588
Table tableWithPosDeletes =
589589
createTableWithSnapshots(
590-
tableDir.toFile().toURI().toString().concat("tableWithMultipleSnapshots"),
590+
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
591591
2,
592592
Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
593593

@@ -599,54 +599,50 @@ public void testPositionDeletesWithMultipleSnapshots() throws Exception {
599599
.iterator()
600600
.next();
601601

602-
// Create first position delete file
603-
List<Pair<CharSequence, Long>> deletes1 =
604-
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
605-
File file1 =
602+
// Create a position delete file
603+
List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(Pair.of(dataFile.location(), 0L));
604+
File deleteFile =
606605
new File(
607-
removePrefix(tableWithPosDeletes.location() + "/data/deeply/nested/deletes_1.parquet"));
608-
DeleteFile positionDeletes1 =
606+
removePrefix(tableWithPosDeletes.location() + "/data/deeply/nested/deletes.parquet"));
607+
DeleteFile positionDeletes =
609608
FileHelpers.writeDeleteFile(
610609
tableWithPosDeletes,
611-
tableWithPosDeletes.io().newOutputFile(file1.toURI().toString()),
612-
deletes1,
610+
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
611+
deletes,
613612
formatVersion)
614613
.first();
615-
tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes1).commit();
614+
615+
// Add the SAME delete file in the first snapshot
616+
tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
616617
long snapshot1 = tableWithPosDeletes.currentSnapshot().snapshotId();
617618

618-
// Create second position delete file
619-
List<Pair<CharSequence, Long>> deletes2 =
620-
Lists.newArrayList(Pair.of(dataFile.location(), 1L));
621-
File file2 =
622-
new File(
623-
removePrefix(tableWithPosDeletes.location() + "/data/deeply/nested/deletes_2.parquet"));
624-
DeleteFile positionDeletes2 =
625-
FileHelpers.writeDeleteFile(
626-
tableWithPosDeletes,
627-
tableWithPosDeletes.io().newOutputFile(file2.toURI().toString()),
628-
deletes2,
629-
formatVersion)
630-
.first();
631-
tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes2).commit();
619+
// Add the SAME delete file AGAIN in a second snapshot - this creates a duplicate entry
620+
// in a new manifest, which will cause duplicate DeleteFile objects when processing
621+
tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
632622
long snapshot2 = tableWithPosDeletes.currentSnapshot().snapshotId();
633623

634-
// Create tags on different snapshots (simulating the production scenario)
624+
// Create tags to ensure both snapshots are processed
635625
tableWithPosDeletes.manageSnapshots().createTag("tag1", snapshot1).commit();
636626
tableWithPosDeletes.manageSnapshots().createTag("tag2", snapshot2).commit();
637627

638-
// This should NOT throw AlreadyExistsException
628+
// This should NOT throw AlreadyExistsException - the fix uses DeleteFileSet to deduplicate
629+
// Without the fix (using Collectors.toSet()), this would fail because:
630+
// 1. Both manifests contain entries for the same delete file
631+
// 2. Processing returns two different DeleteFile objects for the same file
632+
// 3. HashSet doesn't deduplicate them (DeleteFile doesn't override equals())
633+
// 4. rewritePositionDeletes tries to write the same file twice -> AlreadyExistsException
639634
RewriteTablePath.Result result =
640635
actions()
641636
.rewriteTablePath(tableWithPosDeletes)
642637
.stagingLocation(stagingLocation())
643638
.rewriteLocationPrefix(tableWithPosDeletes.location(), targetTableLocation())
644639
.execute();
645640

646-
// Verify the rewrite completed successfully - should have rewritten 2 delete files
641+
// Verify the rewrite completed successfully - should have rewritten exactly 1 delete file
642+
// (the duplicate should be deduplicated by DeleteFileSet)
647643
assertThat(result.rewrittenDeleteFilePathsCount())
648-
.as("Should have rewritten exactly 2 delete files")
649-
.isEqualTo(2);
644+
.as("Should have rewritten exactly 1 delete file after deduplication")
645+
.isEqualTo(1);
650646

651647
// Copy the metadata files and data files
652648
copyTableFiles(result);

0 commit comments

Comments
 (0)