Add Split file operation to extract sub-PDFs from split ranges

ianardee · ianardee · commit 832ce6a7b3fb · 2026-04-30T16:06:48.000+02:00
diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java
@@ -7,7 +7,6 @@
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.List;
 import javax.imageio.ImageIO;
 import org.apache.pdfbox.Loader;
@@ -63,7 +62,7 @@ public BasePDFExtractor(LocalInputSource source) throws IOException {
    * @return a valid ImageIO buffer.
    * @throws IOException Throws if the file can't be accessed.
    */
-  public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
+  private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
     try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) {
       return ImageIO.read(stream);
     }
@@ -76,10 +75,8 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
    * @return A list of extracted files.
    * @throws IOException Throws if the file can't be accessed.
    */
-  public List<ExtractedPDF> extractSubDocuments(
-      List<List<Integer>> pageIndexes
-  ) throws IOException {
-    var extractedPDFs = new ArrayList<ExtractedPDF>();
+  public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws IOException {
+    var extractedPDFs = new ExtractedPDFs();
 
     for (List<Integer> pageIndexElement : pageIndexes) {
       if (pageIndexElement.isEmpty()) {
@@ -94,10 +91,7 @@ public List<ExtractedPDF> extractSubDocuments(
           .replace(" ", "0")
         + "."
         + splitName[1];
-      extractedPDFs
-        .add(
-          new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename)
-        );
+      extractedPDFs.add(extractSinglePage(pageIndexElement, fieldFilename, false));
     }
     return extractedPDFs;
   }
@@ -136,11 +130,27 @@ private static byte[] createPdfFromExistingPdf(
     return output;
   }
 
-  public byte[] mergePdfPages(
-      PDDocument document,
+  public ExtractedPDF extractSinglePage(
+      List<Integer> pageNumbers,
+      String fieldFilename,
+      boolean closeOriginal
+  ) throws IOException {
+    var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
+    return new ExtractedPDF(pdfBytes, fieldFilename);
+  }
+
+  public ExtractedPDF extractSinglePage(
       List<Integer> pageNumbers,
       boolean closeOriginal
   ) throws IOException {
-    return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
+    var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
+    String[] splitName = InputSourceUtils.splitNameStrict(filename);
+    String fieldFilename = splitName[0]
+      + String.format("_%3s", pageNumbers.get(0) + 1).replace(" ", "0")
+      + "-"
+      + String.format("%3s", pageNumbers.get(pageNumbers.size() - 1) + 1).replace(" ", "0")
+      + "."
+      + splitName[1];
+    return new ExtractedPDF(pdfBytes, fieldFilename);
   }
 }
diff --git a/src/main/java/com/mindee/pdf/ExtractedPDFs.java b/src/main/java/com/mindee/pdf/ExtractedPDFs.java
@@ -0,0 +1,10 @@
+package com.mindee.pdf;
+
+import java.util.ArrayList;
+
+/**
+ * A list of {@link ExtractedPDF} elements.
+ */
+public class ExtractedPDFs extends ArrayList<ExtractedPDF> {
+  private static final long serialVersionUID = 1L;
+}
diff --git a/src/main/java/com/mindee/v2/fileOperations/Crop.java b/src/main/java/com/mindee/v2/fileOperations/Crop.java
@@ -15,12 +15,12 @@ public Crop(LocalInputSource inputSource) throws IOException {
     this.imageExtractor = new ImageExtractor(inputSource);
   }
 
-  public ExtractedImage extractSingleCrop(CropItem cropItem) throws IOException {
+  public ExtractedImage extractSingle(CropItem cropItem) throws IOException {
     return this.imageExtractor
       .extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), 0);
   }
 
-  public ExtractedImages extractCrops(List<CropItem> cropItems) {
+  public ExtractedImages extractMultiple(List<CropItem> cropItems) {
     var extractedImages = new ExtractedImages();
     for (int i = 0; i < cropItems.size(); i++) {
       var cropItem = cropItems.get(i);
diff --git a/src/main/java/com/mindee/v2/fileOperations/Split.java b/src/main/java/com/mindee/v2/fileOperations/Split.java
@@ -0,0 +1,55 @@
+package com.mindee.v2.fileOperations;
+
+import com.mindee.input.LocalInputSource;
+import com.mindee.pdf.BasePDFExtractor;
+import com.mindee.pdf.ExtractedPDF;
+import com.mindee.pdf.ExtractedPDFs;
+import com.mindee.v2.product.split.SplitRange;
+import java.io.IOException;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Extracts PDFs from a local input source, based on split ranges.
+ */
+public class Split {
+  private final BasePDFExtractor pdfSplitter;
+
+  /**
+   * Creates the PDF extractor for the given input source.
+   *
+   * @param inputSource Local input source to split.
+   * @throws IOException Throws if the extractor can't be created from the input source.
+   */
+  public Split(LocalInputSource inputSource) throws IOException {
+    this.pdfSplitter = new BasePDFExtractor(inputSource);
+  }
+
+  /**
+   * Extracts a single split range as a PDF.
+   *
+   * @param splitRange Split range whose distinct page values are extracted.
+   * @return The extracted PDF.
+   * @throws IOException Throws if the extraction fails.
+   */
+  public ExtractedPDF extractSingle(SplitRange splitRange) throws IOException {
+    // NOTE(review): closeOriginal is true here but false in extractSubDocuments; confirm that
+    // this instance is not meant to be reused after a call to this method.
+    return this.pdfSplitter.extractSinglePage(splitRange.getPageRangeDistinct(), true);
+  }
+
+  /**
+   * Extracts the given split ranges as separate PDFs.
+   *
+   * @param splitRanges Split ranges to extract.
+   * @return The extracted PDFs.
+   * @throws IOException Throws if the extraction fails.
+   */
+  public ExtractedPDFs extractMultiple(List<SplitRange> splitRanges) throws IOException {
+    var pageIndexes = splitRanges
+      .stream()
+      .map(SplitRange::getPageRangeDistinct)
+      .collect(Collectors.toList());
+    return this.pdfSplitter.extractSubDocuments(pageIndexes);
+  }
+}
diff --git a/src/main/java/com/mindee/v2/product/split/SplitRange.java b/src/main/java/com/mindee/v2/product/split/SplitRange.java
@@ -3,6 +3,8 @@
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
 import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
 import lombok.AllArgsConstructor;
 import lombok.EqualsAndHashCode;
 import lombok.Getter;
@@ -22,11 +24,18 @@ public class SplitRange {
    * indicates the end page.
    */
   @JsonProperty("page_range")
-  public ArrayList<Integer> pageRange;
+  public List<Integer> pageRange;
 
   /**
    * The document type, as identified on given classification values.
    */
   @JsonProperty("document_type")
   public String documentType;
+
+  /**
+   * Returns the distinct values of {@code pageRange}, in order. Useful for extracting pages.
+   */
+  public List<Integer> getPageRangeDistinct() {
+    return new ArrayList<>(new LinkedHashSet<>(this.pageRange));
+  }
 }
diff --git a/src/test/java/com/mindee/v2/fileOperations/CropTest.java b/src/test/java/com/mindee/v2/fileOperations/CropTest.java
@@ -19,7 +19,7 @@ void singlePageSingleCrop_cropsCorrectly() throws Exception {
     var doc = localResponse.deserializeResponse(CropResponse.class);
 
     var extractedCrop = new Crop(inputSample)
-      .extractSingleCrop(doc.getInference().getResult().getCrops().get(0));
+      .extractSingle(doc.getInference().getResult().getCrops().get(0));
 
     assertEquals(0, extractedCrop.getPageId());
     assertEquals("default_sample_000.jpg", extractedCrop.getFilename());
@@ -35,7 +35,7 @@ void singlePageMultiCrop_cropsCorrectly() throws Exception {
     var doc = localResponse.deserializeResponse(CropResponse.class);
 
     var extractedCrops = new Crop(inputSample)
-      .extractCrops(doc.getInference().getResult().getCrops());
+      .extractMultiple(doc.getInference().getResult().getCrops());
 
     assertEquals(2, extractedCrops.size());
 
@@ -59,7 +59,7 @@ void multiPageMultiCrop_cropsCorrectly() throws Exception {
     var doc = localResponse.deserializeResponse(CropResponse.class);
 
     var extractedCrops = new Crop(inputSample)
-      .extractCrops(doc.getInference().getResult().getCrops());
+      .extractMultiple(doc.getInference().getResult().getCrops());
 
     assertEquals(5, extractedCrops.size());
 
diff --git a/src/test/java/com/mindee/v2/fileOperations/SplitTest.java b/src/test/java/com/mindee/v2/fileOperations/SplitTest.java
@@ -0,0 +1,52 @@
+package com.mindee.v2.fileOperations;
+
+import static com.mindee.TestingUtilities.getV2ResourcePath;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import com.mindee.input.LocalInputSource;
+import com.mindee.v2.parsing.LocalResponse;
+import com.mindee.v2.product.split.SplitResponse;
+import java.io.IOException;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Checks PDF extraction through {@link Split} against the default split sample.
+ */
+public class SplitTest {
+  // Extracting the first split range yields one single-page PDF.
+  @Test
+  void singlePage_splitsCorrectly() throws IOException {
+    var pdfSource = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf"));
+    assertEquals(2, pdfSource.getPageCount());
+    var response = new LocalResponse(getV2ResourcePath("products/split/default_sample.json"))
+      .deserializeResponse(SplitResponse.class);
+
+    var splitter = new Split(pdfSource);
+    var firstSplit = splitter
+      .extractSingle(response.getInference().getResult().getSplits().get(0));
+
+    assertEquals("default_sample_001-001.pdf", firstSplit.getFilename());
+    assertEquals(1, firstSplit.asInputSource().getPageCount());
+  }
+
+  // Extracting every split range yields two single-page PDFs, in order.
+  @Test
+  void multiplePages_splitsCorrectly() throws IOException {
+    var pdfSource = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf"));
+    assertEquals(2, pdfSource.getPageCount());
+    var response = new LocalResponse(getV2ResourcePath("products/split/default_sample.json"))
+      .deserializeResponse(SplitResponse.class);
+
+    var splitter = new Split(pdfSource);
+    var allSplits = splitter.extractMultiple(response.getInference().getResult().getSplits());
+
+    assertEquals(2, allSplits.size());
+
+    String[] expectedNames = {"default_sample_001-001.pdf", "default_sample_002-002.pdf"};
+    for (int i = 0; i < expectedNames.length; i++) {
+      var split = allSplits.get(i);
+      assertEquals(expectedNames[i], split.getFilename());
+      assertEquals(1, split.asInputSource().getPageCount());
+    }
+  }
+}
diff --git a/src/test/resources b/src/test/resources
@@ -1 +1 @@
-Subproject commit 13d257739b163a302866762ae3ee097d4d316bb8
+Subproject commit 4cec007c9b9ec7a9f0399fa900914fa51a47308c

Original file line number	Diff line number	Diff line change
`@@ -15,12 +15,12 @@ public Crop(LocalInputSource inputSource) throws IOException {`
`15`	`15`	`this.imageExtractor = new ImageExtractor(inputSource);`
`16`	`16`	`}`
`17`	`17`
`18`		`- public ExtractedImage extractSingleCrop(CropItem cropItem) throws IOException {`
	`18`	`+ public ExtractedImage extractSingle(CropItem cropItem) throws IOException {`
`19`	`19`	`return this.imageExtractor`
`20`	`20`	`.extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), 0);`
`21`	`21`	`}`
`22`	`22`
`23`		`- public ExtractedImages extractCrops(List<CropItem> cropItems) {`
	`23`	`+ public ExtractedImages extractMultiple(List<CropItem> cropItems) {`
`24`	`24`	`var extractedImages = new ExtractedImages();`
`25`	`25`	`for (int i = 0; i < cropItems.size(); i++) {`
`26`	`26`	`var cropItem = cropItems.get(i);`