diff --git a/src/main/java/com/mindee/image/ExtractedImage.java b/src/main/java/com/mindee/image/ExtractedImage.java index 319f98f5d..7b3d0fef9 100644 --- a/src/main/java/com/mindee/image/ExtractedImage.java +++ b/src/main/java/com/mindee/image/ExtractedImage.java @@ -17,6 +17,7 @@ public class ExtractedImage { private final BufferedImage image; private final String filename; private final String saveFormat; + private final int pageId; /** * Default constructor. @@ -25,10 +26,11 @@ public class ExtractedImage { * @param filename Name of the extracted image. * @param saveFormat Format to save the image as, defaults to PNG. */ - public ExtractedImage(BufferedImage image, String filename, String saveFormat) { + public ExtractedImage(BufferedImage image, String filename, String saveFormat, int pageId) { this.image = image; this.filename = filename; this.saveFormat = saveFormat; + this.pageId = pageId; } /** diff --git a/src/main/java/com/mindee/image/ExtractedImages.java b/src/main/java/com/mindee/image/ExtractedImages.java new file mode 100644 index 000000000..2da54231f --- /dev/null +++ b/src/main/java/com/mindee/image/ExtractedImages.java @@ -0,0 +1,17 @@ +package com.mindee.image; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; + +public class ExtractedImages extends ArrayList { + public void saveAllToDisk(String outputPath) throws IOException { + saveAllToDisk(Path.of(outputPath)); + } + + public void saveAllToDisk(Path outputPath) throws IOException { + for (ExtractedImage image : this) { + image.writeToFile(outputPath); + } + } +} diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index e133eda7e..3cf0c6804 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -25,16 +25,18 @@ public class ImageExtractor { private final String saveFormat; public ImageExtractor(LocalInputSource source) throws IOException { - this.filename = source.getFilename(); + this.pageImages = new ArrayList<>(); if (source.isPDF()) { this.saveFormat = "jpg"; - var pdfPageImages = pdfToImages(source.getFile(), this.filename); + var pdfPageImages = pdfToImages(source.getFile(), source.getFilename()); for (PDFPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } + this.filename = source.getFilename() + "." + this.saveFormat; } else { + this.filename = source.getFilename(); String[] splitName = InputSourceUtils.splitNameStrict(this.filename); this.saveFormat = splitName[1].toLowerCase(); @@ -43,7 +45,7 @@ public ImageExtractor(LocalInputSource source) throws IOException { } } - public List pdfToImages(byte[] fileBytes, String filename) throws IOException { + private List pdfToImages(byte[] fileBytes, String filename) throws IOException { PDDocument document = Loader.loadPDF(fileBytes); var pdfRenderer = new PDFRenderer(document); List pdfPageImages = new ArrayList<>(); @@ -90,7 +92,7 @@ public int getPageCount() { * @param pageIndex The page index to extract, begins at 0. * @return A list of {@link ExtractedImage}. */ - public List extractImagesFromPage( + public ExtractedImages extractImagesFromPage( List fields, int pageIndex ) { @@ -106,7 +108,7 @@ public List extractImagesFrom * @param outputName The base output filename, must have an image extension. * @return A list of {@link ExtractedImage}. */ - public List extractImagesFromPage( + public ExtractedImages extractImagesFromPage( List fields, int pageIndex, String outputName @@ -121,7 +123,7 @@ public List extractImagesFrom return extractFromPage(fields, pageIndex, filename); } - private List extractFromPage( + private ExtractedImages extractFromPage( List fields, int pageIndex, String outputName @@ -131,7 +133,7 @@ private List extractFromPage( .format("%s_page-%3s.%s", splitName[0], pageIndex + 1, splitName[1]) .replace(" ", "0"); - var extractedImages = new ArrayList(); + var extractedImages = new ExtractedImages(); for (int i = 0; i < fields.size(); i++) { ExtractedImage extractedImage = extractImage(fields.get(i), pageIndex, i + 1, filename); if (extractedImage != null) { @@ -171,7 +173,8 @@ public ExtractedImage extractImage( return new ExtractedImage( extractImage(polygon.getAsBbox(), pageIndex), fieldFilename, - saveFormat + saveFormat, + pageIndex ); } diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index d7263d803..f1564e515 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -7,7 +7,6 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import javax.imageio.ImageIO; import org.apache.pdfbox.Loader; @@ -56,17 +55,15 @@ public BasePDFExtractor(LocalInputSource source) throws IOException { } } - /** - * Converts an array to a buffered image. - * - * @param byteArray Raw byte array. - * @return a valid ImageIO buffer. - * @throws IOException Throws if the file can't be accessed. - */ - public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException { - try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) { - return ImageIO.read(stream); + public ExtractedPDF extractSinglePage( + List pageNumbers, + boolean closeOriginal + ) throws IOException { + if (pageNumbers.isEmpty()) { + throw new MindeeException("Empty indexes not allowed for extraction."); } + var pdfBytes = createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal); + return new ExtractedPDF(pdfBytes, makeFilename(pageNumbers)); } /** @@ -76,32 +73,41 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO * @return A list of extracted files. * @throws IOException Throws if the file can't be accessed. */ - public List extractSubDocuments( - List> pageIndexes - ) throws IOException { - var extractedPDFs = new ArrayList(); + public ExtractedPDFs extractSubDocuments(List> pageIndexes) throws IOException { + var extractedPDFs = new ExtractedPDFs(); for (List pageIndexElement : pageIndexes) { - if (pageIndexElement.isEmpty()) { - throw new MindeeException("Empty indexes not allowed for extraction."); - } - String[] splitName = InputSourceUtils.splitNameStrict(filename); - String fieldFilename = splitName[0] - + String.format("_%3s", pageIndexElement.get(0) + 1).replace(" ", "0") - + "-" - + String - .format("%3s", pageIndexElement.get(pageIndexElement.size() - 1) + 1) - .replace(" ", "0") - + "." - + splitName[1]; - extractedPDFs - .add( - new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename) - ); + extractedPDFs.add(extractSinglePage(pageIndexElement, false)); } return extractedPDFs; } + /** + * Converts an array to a buffered image. + * + * @param byteArray Raw byte array. + * @return a valid ImageIO buffer. + * @throws IOException Throws if the file can't be accessed. + */ + private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException { + try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) { + return ImageIO.read(stream); + } + } + + /** + * Make a nice filename for the split. + */ + private String makeFilename(List pageNumbers) { + String[] splitName = InputSourceUtils.splitNameStrict(filename); + return splitName[0] + + String.format("_%3s", pageNumbers.get(0)).replace(" ", "0") + + "-" + + String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0") + + "." + + splitName[1]; + } + private static PDPage clonePage(PDPage page) { COSDictionary pageDict = page.getCOSObject(); @@ -135,12 +141,4 @@ private static byte[] createPdfFromExistingPdf( outputStream.close(); return output; } - - public byte[] mergePdfPages( - PDDocument document, - List pageNumbers, - boolean closeOriginal - ) throws IOException { - return createPdfFromExistingPdf(document, pageNumbers, closeOriginal); - } } diff --git a/src/main/java/com/mindee/pdf/ExtractedPDFs.java b/src/main/java/com/mindee/pdf/ExtractedPDFs.java new file mode 100644 index 000000000..fb36467d1 --- /dev/null +++ b/src/main/java/com/mindee/pdf/ExtractedPDFs.java @@ -0,0 +1,6 @@ +package com.mindee.pdf; + +import java.util.ArrayList; + +public class ExtractedPDFs extends ArrayList { +} diff --git a/src/main/java/com/mindee/v2/fileOperations/Crop.java b/src/main/java/com/mindee/v2/fileOperations/Crop.java new file mode 100644 index 000000000..d9f8ef6df --- /dev/null +++ b/src/main/java/com/mindee/v2/fileOperations/Crop.java @@ -0,0 +1,35 @@ +package com.mindee.v2.fileOperations; + +import com.mindee.image.ExtractedImage; +import com.mindee.image.ExtractedImages; +import com.mindee.image.ImageExtractor; +import com.mindee.input.LocalInputSource; +import com.mindee.v2.product.crop.CropItem; +import java.io.IOException; +import java.util.List; + +public class Crop { + private final ImageExtractor imageExtractor; + + public Crop(LocalInputSource inputSource) throws IOException { + this.imageExtractor = new ImageExtractor(inputSource); + } + + public ExtractedImage extractSingle(CropItem cropItem) throws IOException { + return this.imageExtractor + .extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), 0); + } + + public ExtractedImages extractMultiple(List cropItems) { + var extractedImages = new ExtractedImages(); + for (int i = 0; i < cropItems.size(); i++) { + var cropItem = cropItems.get(i); + extractedImages + .add( + this.imageExtractor + .extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), i + 1) + ); + } + return extractedImages; + } +} diff --git a/src/main/java/com/mindee/v2/fileOperations/Split.java b/src/main/java/com/mindee/v2/fileOperations/Split.java new file mode 100644 index 000000000..00b1174c4 --- /dev/null +++ b/src/main/java/com/mindee/v2/fileOperations/Split.java @@ -0,0 +1,29 @@ +package com.mindee.v2.fileOperations; + +import com.mindee.input.LocalInputSource; +import com.mindee.pdf.BasePDFExtractor; +import com.mindee.pdf.ExtractedPDF; +import com.mindee.pdf.ExtractedPDFs; +import com.mindee.v2.product.split.SplitRange; +import java.io.IOException; +import java.util.ArrayList; +import java.util.stream.Collectors; + +public class Split { + BasePDFExtractor pdfSplitter; + + public Split(LocalInputSource inputSource) throws IOException { + this.pdfSplitter = new BasePDFExtractor(inputSource); + } + + public ExtractedPDF extractSingle(SplitRange splitRange) throws IOException { + return this.pdfSplitter.extractSinglePage(splitRange.getPageRangeDistinct(), true); + } + + public ExtractedPDFs extractMultiple(ArrayList splitRanges) throws IOException { + return this.pdfSplitter + .extractSubDocuments( + splitRanges.stream().map(SplitRange::getPageRangeDistinct).collect(Collectors.toList()) + ); + } +} diff --git a/src/main/java/com/mindee/v2/product/split/SplitRange.java b/src/main/java/com/mindee/v2/product/split/SplitRange.java index 254805204..d8487a64f 100644 --- a/src/main/java/com/mindee/v2/product/split/SplitRange.java +++ b/src/main/java/com/mindee/v2/product/split/SplitRange.java @@ -3,6 +3,8 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.Getter; @@ -22,11 +24,18 @@ public class SplitRange { * indicates the end page. */ @JsonProperty("page_range") - public ArrayList pageRange; + public List pageRange; /** * The document type, as identified on given classification values. */ @JsonProperty("document_type") public String documentType; + + /** + * Returns a list of distinct page ranges. Useful for extracting pages from a document. + */ + public List getPageRangeDistinct() { + return new ArrayList<>(new LinkedHashSet<>(this.pageRange)); + } } diff --git a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java b/src/test/java/com/mindee/v1/fileOperations/InvoiceSplitterAutoExtractionIT.java similarity index 97% rename from src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java rename to src/test/java/com/mindee/v1/fileOperations/InvoiceSplitterAutoExtractionIT.java index dd33c1663..ac8ceeded 100644 --- a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java +++ b/src/test/java/com/mindee/v1/fileOperations/InvoiceSplitterAutoExtractionIT.java @@ -1,4 +1,4 @@ -package com.mindee.v1.fileOperation; +package com.mindee.v1.fileOperations; import static com.mindee.TestingUtilities.getV1ResourcePath; import static com.mindee.TestingUtilities.levenshteinRatio; @@ -71,8 +71,8 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc List extractedPDFsStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(2, extractedPDFsStrict.size()); - Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename()); - Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename()); + Assertions.assertEquals("default_sample_000-000.pdf", extractedPDFsStrict.get(0).getFilename()); + Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(1).getFilename()); PredictResponse invoice0 = getInvoicePrediction( extractedPDFsStrict.get(0).asInputSource() diff --git a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java index e68a3d52e..16de8338e 100644 --- a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java +++ b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java @@ -31,9 +31,9 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException { var extractedPDFSNoStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(3, extractedPDFSNoStrict.size()); - Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFSNoStrict.get(0).getFilename()); - Assertions.assertEquals("invoice_5p_002-004.pdf", extractedPDFSNoStrict.get(1).getFilename()); - Assertions.assertEquals("invoice_5p_005-005.pdf", extractedPDFSNoStrict.get(2).getFilename()); + Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFSNoStrict.get(0).getFilename()); + Assertions.assertEquals("invoice_5p_001-003.pdf", extractedPDFSNoStrict.get(1).getFilename()); + Assertions.assertEquals("invoice_5p_004-004.pdf", extractedPDFSNoStrict.get(2).getFilename()); } @Test @@ -48,7 +48,7 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException { var extractedPDFStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), true); Assertions.assertEquals(2, extractedPDFStrict.size()); - Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFStrict.get(0).getFilename()); - Assertions.assertEquals("invoice_5p_002-005.pdf", extractedPDFStrict.get(1).getFilename()); + Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFStrict.get(0).getFilename()); + Assertions.assertEquals("invoice_5p_001-004.pdf", extractedPDFStrict.get(1).getFilename()); } } diff --git a/src/test/java/com/mindee/v2/fileOperations/CropTest.java b/src/test/java/com/mindee/v2/fileOperations/CropTest.java new file mode 100644 index 000000000..44796029d --- /dev/null +++ b/src/test/java/com/mindee/v2/fileOperations/CropTest.java @@ -0,0 +1,83 @@ +package com.mindee.v2.fileOperations; + +import static com.mindee.TestingUtilities.getResourcePath; +import static com.mindee.TestingUtilities.getV2ResourcePath; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.mindee.input.LocalInputSource; +import com.mindee.v2.parsing.LocalResponse; +import com.mindee.v2.product.crop.CropResponse; +import java.nio.file.Files; +import org.junit.jupiter.api.Test; + +class CropTest { + @Test + void singlePageSingleCrop_cropsCorrectly() throws Exception { + var inputSample = new LocalInputSource(getV2ResourcePath("products/crop/default_sample.jpg")); + var localResponse = new LocalResponse(getV2ResourcePath("products/crop/default_sample.json")); + var doc = localResponse.deserializeResponse(CropResponse.class); + + var extractedCrop = new Crop(inputSample) + .extractSingle(doc.getInference().getResult().getCrops().get(0)); + + assertEquals(0, extractedCrop.getPageId()); + assertEquals("default_sample_000.jpg", extractedCrop.getFilename()); + + assertEquals(1056, extractedCrop.getImage().getWidth()); + assertEquals(2070, extractedCrop.getImage().getHeight()); + } + + @Test + void singlePageMultiCrop_cropsCorrectly() throws Exception { + var inputSample = new LocalInputSource(getV2ResourcePath("products/crop/default_sample.jpg")); + var localResponse = new LocalResponse(getV2ResourcePath("products/crop/default_sample.json")); + var doc = localResponse.deserializeResponse(CropResponse.class); + + var extractedCrops = new Crop(inputSample) + .extractMultiple(doc.getInference().getResult().getCrops()); + + assertEquals(2, extractedCrops.size()); + + var crop0 = extractedCrops.get(0); + assertEquals(0, crop0.getPageId()); + assertEquals("default_sample_001.jpg", crop0.getFilename()); + + assertEquals(1056, crop0.getImage().getWidth()); + assertEquals(2070, crop0.getImage().getHeight()); + + var outputPath = getResourcePath("output"); + extractedCrops.saveAllToDisk(outputPath); + assertTrue(Files.exists(outputPath.resolve("default_sample_001.jpg"))); + assertTrue(Files.exists(outputPath.resolve("default_sample_002.jpg"))); + } + + @Test + void multiPageMultiCrop_cropsCorrectly() throws Exception { + var inputSample = new LocalInputSource(getV2ResourcePath("products/crop/multipage_sample.pdf")); + var localResponse = new LocalResponse(getV2ResourcePath("products/crop/multipage_sample.json")); + var doc = localResponse.deserializeResponse(CropResponse.class); + + var extractedCrops = new Crop(inputSample) + .extractMultiple(doc.getInference().getResult().getCrops()); + + assertEquals(5, extractedCrops.size()); + + var crop0 = extractedCrops.get(0); + assertEquals(0, crop0.getPageId()); + assertEquals("multipage_sample.pdf_001.jpg", crop0.getFilename()); + assertEquals(555, crop0.getImage().getWidth()); + assertEquals(1533, crop0.getImage().getHeight()); + + var crop3 = extractedCrops.get(3); + assertEquals(1, crop3.getPageId()); + assertEquals("multipage_sample.pdf_004.jpg", crop3.getFilename()); + assertEquals(562, crop3.getImage().getWidth()); + assertEquals(974, crop3.getImage().getHeight()); + + var outputPath = getResourcePath("output"); + extractedCrops.saveAllToDisk(outputPath); + assertTrue(Files.exists(outputPath.resolve("multipage_sample.pdf_001.jpg"))); + assertTrue(Files.exists(outputPath.resolve("multipage_sample.pdf_005.jpg"))); + } +} diff --git a/src/test/java/com/mindee/v2/fileOperations/SplitTest.java b/src/test/java/com/mindee/v2/fileOperations/SplitTest.java new file mode 100644 index 000000000..2ba33cd77 --- /dev/null +++ b/src/test/java/com/mindee/v2/fileOperations/SplitTest.java @@ -0,0 +1,50 @@ +package com.mindee.v2.fileOperations; + +import static com.mindee.TestingUtilities.getV2ResourcePath; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import com.mindee.input.LocalInputSource; +import com.mindee.v2.parsing.LocalResponse; +import com.mindee.v2.product.split.SplitResponse; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +public class SplitTest { + @Test + void singlePage_splitsCorrectly() throws IOException { + var inputSample = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf")); + assertEquals(2, inputSample.getPageCount()); + var localResponse = new LocalResponse(getV2ResourcePath("products/split/default_sample.json")); + var doc = localResponse.deserializeResponse(SplitResponse.class); + + var extractedSplit = new Split(inputSample) + .extractSingle(doc.getInference().getResult().getSplits().get(0)); + + assertEquals("default_sample_000-000.pdf", extractedSplit.getFilename()); + var asInputSource = extractedSplit.asInputSource(); + assertEquals(1, asInputSource.getPageCount()); + } + + @Test + void multiplePages_splitsCorrectly() throws IOException { + var inputSample = new LocalInputSource(getV2ResourcePath("products/split/default_sample.pdf")); + assertEquals(2, inputSample.getPageCount()); + var localResponse = new LocalResponse(getV2ResourcePath("products/split/default_sample.json")); + var doc = localResponse.deserializeResponse(SplitResponse.class); + + var extractedSplits = new Split(inputSample) + .extractMultiple(doc.getInference().getResult().getSplits()); + + assertEquals(2, extractedSplits.size()); + + var split0 = extractedSplits.get(0); + assertEquals("default_sample_000-000.pdf", split0.getFilename()); + var asInputSource0 = split0.asInputSource(); + assertEquals(1, asInputSource0.getPageCount()); + + var split1 = extractedSplits.get(1); + assertEquals("default_sample_001-001.pdf", split1.getFilename()); + var asInputSource1 = split1.asInputSource(); + assertEquals(1, asInputSource1.getPageCount()); + } +} diff --git a/src/test/resources b/src/test/resources index 53f0efbc0..4cec007c9 160000 --- a/src/test/resources +++ b/src/test/resources @@ -1 +1 @@ -Subproject commit 53f0efbc08c77c2c085aadd27de9d2d6c359276e +Subproject commit 4cec007c9b9ec7a9f0399fa900914fa51a47308c