Skip to content

Commit 832ce6a

Browse files
committed
add split
1 parent 50253d4 commit 832ce6a

8 files changed

Lines changed: 124 additions & 20 deletions

File tree

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import java.io.ByteArrayInputStream;
88
import java.io.ByteArrayOutputStream;
99
import java.io.IOException;
10-
import java.util.ArrayList;
1110
import java.util.List;
1211
import javax.imageio.ImageIO;
1312
import org.apache.pdfbox.Loader;
@@ -63,7 +62,7 @@ public BasePDFExtractor(LocalInputSource source) throws IOException {
6362
* @return a valid ImageIO buffer.
6463
* @throws IOException Throws if the file can't be accessed.
6564
*/
66-
public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
65+
private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
6766
try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) {
6867
return ImageIO.read(stream);
6968
}
@@ -76,10 +75,8 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
7675
* @return A list of extracted files.
7776
* @throws IOException Throws if the file can't be accessed.
7877
*/
79-
public List<ExtractedPDF> extractSubDocuments(
80-
List<List<Integer>> pageIndexes
81-
) throws IOException {
82-
var extractedPDFs = new ArrayList<ExtractedPDF>();
78+
public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws IOException {
79+
var extractedPDFs = new ExtractedPDFs();
8380

8481
for (List<Integer> pageIndexElement : pageIndexes) {
8582
if (pageIndexElement.isEmpty()) {
@@ -94,10 +91,7 @@ public List<ExtractedPDF> extractSubDocuments(
9491
.replace(" ", "0")
9592
+ "."
9693
+ splitName[1];
97-
extractedPDFs
98-
.add(
99-
new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename)
100-
);
94+
extractedPDFs.add(extractSinglePage(pageIndexElement, fieldFilename, false));
10195
}
10296
return extractedPDFs;
10397
}
@@ -136,11 +130,27 @@ private static byte[] createPdfFromExistingPdf(
136130
return output;
137131
}
138132

139-
public byte[] mergePdfPages(
140-
PDDocument document,
133+
public ExtractedPDF extractSinglePage(
134+
List<Integer> pageNumbers,
135+
String fieldFilename,
136+
boolean closeOriginal
137+
) throws IOException {
138+
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
139+
return new ExtractedPDF(pdfBytes, fieldFilename);
140+
}
141+
142+
public ExtractedPDF extractSinglePage(
141143
List<Integer> pageNumbers,
142144
boolean closeOriginal
143145
) throws IOException {
144-
return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
146+
var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal);
147+
String[] splitName = InputSourceUtils.splitNameStrict(filename);
148+
String fieldFilename = splitName[0]
149+
+ String.format("_%3s", pageNumbers.get(0) + 1).replace(" ", "0")
150+
+ "-"
151+
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1) + 1).replace(" ", "0")
152+
+ "."
153+
+ splitName[1];
154+
return new ExtractedPDF(pdfBytes, fieldFilename);
145155
}
146156
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package com.mindee.pdf;
2+
3+
import java.util.ArrayList;
4+
5+
public class ExtractedPDFs extends ArrayList<ExtractedPDF> {
6+
}

src/main/java/com/mindee/v2/fileOperations/Crop.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ public Crop(LocalInputSource inputSource) throws IOException {
1515
this.imageExtractor = new ImageExtractor(inputSource);
1616
}
1717

18-
public ExtractedImage extractSingleCrop(CropItem cropItem) throws IOException {
18+
public ExtractedImage extractSingle(CropItem cropItem) throws IOException {
1919
return this.imageExtractor
2020
.extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), 0);
2121
}
2222

23-
public ExtractedImages extractCrops(List<CropItem> cropItems) {
23+
public ExtractedImages extractMultiple(List<CropItem> cropItems) {
2424
var extractedImages = new ExtractedImages();
2525
for (int i = 0; i < cropItems.size(); i++) {
2626
var cropItem = cropItems.get(i);
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
package com.mindee.v2.fileOperations;
2+
3+
import com.mindee.input.LocalInputSource;
import com.mindee.pdf.BasePDFExtractor;
import com.mindee.pdf.ExtractedPDF;
import com.mindee.pdf.ExtractedPDFs;
import com.mindee.v2.product.split.SplitRange;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
11+
12+
public class Split {
13+
BasePDFExtractor pdfSplitter;
14+
15+
public Split(LocalInputSource inputSource) throws IOException {
16+
this.pdfSplitter = new BasePDFExtractor(inputSource);
17+
}
18+
19+
public ExtractedPDF extractSingle(SplitRange splitRange) throws IOException {
20+
return this.pdfSplitter.extractSinglePage(splitRange.getPageRangeDistinct(), true);
21+
}
22+
23+
public ExtractedPDFs extractMultiple(ArrayList<SplitRange> splitRanges) throws IOException {
24+
return this.pdfSplitter
25+
.extractSubDocuments(
26+
splitRanges.stream().map(SplitRange::getPageRangeDistinct).collect(Collectors.toList())
27+
);
28+
}
29+
}

src/main/java/com/mindee/v2/product/split/SplitRange.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
44
import com.fasterxml.jackson.annotation.JsonProperty;
55
import java.util.ArrayList;
6+
import java.util.LinkedHashSet;
7+
import java.util.List;
68
import lombok.AllArgsConstructor;
79
import lombok.EqualsAndHashCode;
810
import lombok.Getter;
@@ -22,11 +24,18 @@ public class SplitRange {
2224
* indicates the end page.
2325
*/
2426
@JsonProperty("page_range")
25-
public ArrayList<Integer> pageRange;
27+
public List<Integer> pageRange;
2628

2729
/**
2830
* The document type, as identified on given classification values.
2931
*/
3032
@JsonProperty("document_type")
3133
public String documentType;
34+
35+
/**
36+
* Returns a list of distinct page ranges. Useful for extracting pages from a document.
37+
*/
38+
public List<Integer> getPageRangeDistinct() {
39+
return new ArrayList<>(new LinkedHashSet<>(this.pageRange));
40+
}
3241
}

src/test/java/com/mindee/v2/fileOperations/CropTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ void singlePageSingleCrop_cropsCorrectly() throws Exception {
1919
var doc = localResponse.deserializeResponse(CropResponse.class);
2020

2121
var extractedCrop = new Crop(inputSample)
22-
.extractSingleCrop(doc.getInference().getResult().getCrops().get(0));
22+
.extractSingle(doc.getInference().getResult().getCrops().get(0));
2323

2424
assertEquals(0, extractedCrop.getPageId());
2525
assertEquals("default_sample_000.jpg", extractedCrop.getFilename());
@@ -35,7 +35,7 @@ void singlePageMultiCrop_cropsCorrectly() throws Exception {
3535
var doc = localResponse.deserializeResponse(CropResponse.class);
3636

3737
var extractedCrops = new Crop(inputSample)
38-
.extractCrops(doc.getInference().getResult().getCrops());
38+
.extractMultiple(doc.getInference().getResult().getCrops());
3939

4040
assertEquals(2, extractedCrops.size());
4141

@@ -59,7 +59,7 @@ void multiPageMultiCrop_cropsCorrectly() throws Exception {
5959
var doc = localResponse.deserializeResponse(CropResponse.class);
6060

6161
var extractedCrops = new Crop(inputSample)
62-
.extractCrops(doc.getInference().getResult().getCrops());
62+
.extractMultiple(doc.getInference().getResult().getCrops());
6363

6464
assertEquals(5, extractedCrops.size());
6565

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package com.mindee.v2.fileOperations;
2+
3+
import static com.mindee.TestingUtilities.getV2ResourcePath;
4+
import static org.junit.jupiter.api.Assertions.assertEquals;
5+
6+
import com.mindee.input.LocalInputSource;
7+
import com.mindee.v2.parsing.LocalResponse;
8+
import com.mindee.v2.product.split.SplitResponse;
9+
import java.io.IOException;
10+
import org.junit.jupiter.api.Test;
11+
12+
public class SplitTest {
13+
@Test
14+
void singlePage_splitsCorrectly() throws IOException {
15+
var inputSample = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf"));
16+
assertEquals(2, inputSample.getPageCount());
17+
var localResponse = new LocalResponse(getV2ResourcePath("products/split/default_sample.json"));
18+
var doc = localResponse.deserializeResponse(SplitResponse.class);
19+
20+
var extractedSplit = new Split(inputSample)
21+
.extractSingle(doc.getInference().getResult().getSplits().get(0));
22+
23+
assertEquals("default_sample_001-001.pdf", extractedSplit.getFilename());
24+
var asInputSource = extractedSplit.asInputSource();
25+
assertEquals(1, asInputSource.getPageCount());
26+
}
27+
28+
@Test
29+
void multiplePages_splitsCorrectly() throws IOException {
30+
var inputSample = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf"));
31+
assertEquals(2, inputSample.getPageCount());
32+
var localResponse = new LocalResponse(getV2ResourcePath("products/split/default_sample.json"));
33+
var doc = localResponse.deserializeResponse(SplitResponse.class);
34+
35+
var extractedSplits = new Split(inputSample)
36+
.extractMultiple(doc.getInference().getResult().getSplits());
37+
38+
assertEquals(2, extractedSplits.size());
39+
40+
var split0 = extractedSplits.get(0);
41+
assertEquals("default_sample_001-001.pdf", split0.getFilename());
42+
var asInputSource0 = split0.asInputSource();
43+
assertEquals(1, asInputSource0.getPageCount());
44+
45+
var split1 = extractedSplits.get(1);
46+
assertEquals("default_sample_002-002.pdf", split1.getFilename());
47+
var asInputSource1 = split1.asInputSource();
48+
assertEquals(1, asInputSource1.getPageCount());
49+
}
50+
}

src/test/resources

0 commit comments

Comments
 (0)