@@ -568,26 +568,26 @@ public void testPositionDeletesAcrossFiles() throws Exception {
568568 /**
569569 * Test for https://github.com/apache/iceberg/issues/14814
570570 *
571- * <p>This test verifies that rewrite_table_path correctly handles position delete files when
572- * multiple snapshots exist. The fix uses DeleteFileSet to properly deduplicate delete files based
573- * on file location, content offset, and content size, rather than relying on object identity.
571+ * <p>This test verifies that rewrite_table_path correctly deduplicates delete files when the same
572+ * delete file appears in multiple manifests. Without the DeleteFileSet fix, this test would fail
573+ * with AlreadyExistsException because DeleteFile objects don't override equals() and the same
574+ * file would be processed multiple times.
574575 *
575- * <p>Note: The original bug occurred when the same delete file appeared in multiple manifest
576- * entries (e.g., with ADDED and DELETED status), causing AlreadyExistsException. This test
577- * exercises the code path with DeleteFileSet deduplication but may not fully reproduce the edge
578- * case that occurs in production environments with complex manifest structures.
576+ * <p>The test creates a scenario where the same delete file is added to multiple snapshots,
577+ * causing it to appear in multiple manifest entries. When these manifests are processed, the same
578+ * delete file is returned as different object instances which need proper deduplication.
579579 */
580580 @ TestTemplate
581- public void testPositionDeletesWithMultipleSnapshots () throws Exception {
581+ public void testPositionDeletesDeduplication () throws Exception {
582582 // Format versions 3 and 4 use Deletion Vectors stored in Puffin files, which have different
583- // validation rules that prevent adding multiple position deletes for the same data file
583+ // validation rules that prevent adding the same delete file multiple times
584584 assumeThat (formatVersion )
585585 .as ("Format versions 3+ use DVs with different validation rules" )
586586 .isEqualTo (2 );
587587
588588 Table tableWithPosDeletes =
589589 createTableWithSnapshots (
590- tableDir .toFile ().toURI ().toString ().concat ("tableWithMultipleSnapshots " ),
590+ tableDir .toFile ().toURI ().toString ().concat ("tableWithDuplicateDeletes " ),
591591 2 ,
592592 Map .of (TableProperties .DELETE_DEFAULT_FILE_FORMAT , "parquet" ));
593593
@@ -599,54 +599,50 @@ public void testPositionDeletesWithMultipleSnapshots() throws Exception {
599599 .iterator ()
600600 .next ();
601601
602- // Create first position delete file
603- List <Pair <CharSequence , Long >> deletes1 =
604- Lists .newArrayList (Pair .of (dataFile .location (), 0L ));
605- File file1 =
602+ // Create a position delete file
603+ List <Pair <CharSequence , Long >> deletes = Lists .newArrayList (Pair .of (dataFile .location (), 0L ));
604+ File deleteFile =
606605 new File (
607- removePrefix (tableWithPosDeletes .location () + "/data/deeply/nested/deletes_1 .parquet" ));
608- DeleteFile positionDeletes1 =
606+ removePrefix (tableWithPosDeletes .location () + "/data/deeply/nested/deletes .parquet" ));
607+ DeleteFile positionDeletes =
609608 FileHelpers .writeDeleteFile (
610609 tableWithPosDeletes ,
611- tableWithPosDeletes .io ().newOutputFile (file1 .toURI ().toString ()),
612- deletes1 ,
610+ tableWithPosDeletes .io ().newOutputFile (deleteFile .toURI ().toString ()),
611+ deletes ,
613612 formatVersion )
614613 .first ();
615- tableWithPosDeletes .newRowDelta ().addDeletes (positionDeletes1 ).commit ();
614+
615+ // Add the SAME delete file in the first snapshot
616+ tableWithPosDeletes .newRowDelta ().addDeletes (positionDeletes ).commit ();
616617 long snapshot1 = tableWithPosDeletes .currentSnapshot ().snapshotId ();
617618
618- // Create second position delete file
619- List <Pair <CharSequence , Long >> deletes2 =
620- Lists .newArrayList (Pair .of (dataFile .location (), 1L ));
621- File file2 =
622- new File (
623- removePrefix (tableWithPosDeletes .location () + "/data/deeply/nested/deletes_2.parquet" ));
624- DeleteFile positionDeletes2 =
625- FileHelpers .writeDeleteFile (
626- tableWithPosDeletes ,
627- tableWithPosDeletes .io ().newOutputFile (file2 .toURI ().toString ()),
628- deletes2 ,
629- formatVersion )
630- .first ();
631- tableWithPosDeletes .newRowDelta ().addDeletes (positionDeletes2 ).commit ();
619+ // Add the SAME delete file AGAIN in a second snapshot - this creates a duplicate entry
620+ // in a new manifest, which will cause duplicate DeleteFile objects when processing
621+ tableWithPosDeletes .newRowDelta ().addDeletes (positionDeletes ).commit ();
632622 long snapshot2 = tableWithPosDeletes .currentSnapshot ().snapshotId ();
633623
634- // Create tags on different snapshots (simulating the production scenario)
624+ // Create tags to ensure both snapshots are processed
635625 tableWithPosDeletes .manageSnapshots ().createTag ("tag1" , snapshot1 ).commit ();
636626 tableWithPosDeletes .manageSnapshots ().createTag ("tag2" , snapshot2 ).commit ();
637627
638- // This should NOT throw AlreadyExistsException
628+ // This should NOT throw AlreadyExistsException - the fix uses DeleteFileSet to deduplicate
629+ // Without the fix (using Collectors.toSet()), this would fail because:
630+ // 1. Both manifests contain entries for the same delete file
631+ // 2. Processing returns two different DeleteFile objects for the same file
632+ // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override equals())
633+ // 4. rewritePositionDeletes tries to write the same file twice -> AlreadyExistsException
639634 RewriteTablePath .Result result =
640635 actions ()
641636 .rewriteTablePath (tableWithPosDeletes )
642637 .stagingLocation (stagingLocation ())
643638 .rewriteLocationPrefix (tableWithPosDeletes .location (), targetTableLocation ())
644639 .execute ();
645640
646- // Verify the rewrite completed successfully - should have rewritten 2 delete files
641+ // Verify the rewrite completed successfully - should have rewritten exactly 1 delete file
642+ // (the duplicate should be deduplicated by DeleteFileSet)
647643 assertThat (result .rewrittenDeleteFilePathsCount ())
648- .as ("Should have rewritten exactly 2 delete files " )
649- .isEqualTo (2 );
644+ .as ("Should have rewritten exactly 1 delete file after deduplication " )
645+ .isEqualTo (1 );
650646
651647 // Copy the metadata files and data files
652648 copyTableFiles (result );
0 commit comments