From bbe4aceb0a6c6d31ac4ba3899eee2cfca350e09b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Tue, 28 Apr 2026 10:59:39 +0200 Subject: [PATCH 1/4] again with the PDF interfaces --- .../java/com/mindee/image/ImageExtractor.java | 4 +- .../com/mindee/input/InputSourceUtils.java | 48 ---------- .../com/mindee/input/LocalInputSource.java | 10 +- .../java/com/mindee/pdf/BasePDFExtractor.java | 2 +- .../mindee/pdf/ExtractionPDFOperation.java | 26 ++++++ .../mindee/pdf/InputSourcePDFOperation.java | 37 ++++++++ src/main/java/com/mindee/pdf/PDFBoxApi.java | 92 ++++++++++++++----- .../java/com/mindee/pdf/PDFOperation.java | 44 --------- .../java/com/mindee/pdf/PDFOperationTest.java | 69 +++++++------- 9 files changed, 174 insertions(+), 158 deletions(-) create mode 100644 src/main/java/com/mindee/pdf/ExtractionPDFOperation.java create mode 100644 src/main/java/com/mindee/pdf/InputSourcePDFOperation.java delete mode 100644 src/main/java/com/mindee/pdf/PDFOperation.java diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index 3309e160a..aca307b6c 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -5,7 +5,7 @@ import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; import com.mindee.pdf.PDFBoxApi; -import com.mindee.pdf.PDFOperation; +import com.mindee.pdf.ExtractionPDFOperation; import com.mindee.pdf.PdfPageImage; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; @@ -22,7 +22,7 @@ public class ImageExtractor { private final String filename; private final String saveFormat; - public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException { + public ImageExtractor(LocalInputSource source, ExtractionPDFOperation pdfOperation) throws IOException { this.filename = source.getFilename(); this.pageImages = new ArrayList<>(); diff --git a/src/main/java/com/mindee/input/InputSourceUtils.java b/src/main/java/com/mindee/input/InputSourceUtils.java index 076320b56..f6d86877d 100644 --- a/src/main/java/com/mindee/input/InputSourceUtils.java +++ b/src/main/java/com/mindee/input/InputSourceUtils.java @@ -1,12 +1,6 @@ package com.mindee.input; import com.mindee.MindeeException; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.RandomAccessReadBuffer; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; /** * Utilities for working with files. @@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException { } return new String[] { name, extension }; } - - /** - * Returns true if the file is a PDF. - */ - public static boolean isPdf(byte[] fileBytes) { - try { - Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); - } catch (IOException e) { - return false; - } - return true; - } - - /** - * Returns true if the source PDF has source text inside. Returns false for images. - * - * @param fileBytes A byte array representing a PDF. - * @return True if at least one character exists in one page. - * @throws MindeeException if the file could not be read. - */ - public static boolean hasSourceText(byte[] fileBytes) { - try { - PDDocument document = Loader - .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); - PDFTextStripper stripper = new PDFTextStripper(); - - for (int i = 0; i < document.getNumberOfPages(); i++) { - stripper.setStartPage(i + 1); - stripper.setEndPage(i + 1); - String pageText = stripper.getText(document); - if (!pageText.trim().isEmpty()) { - document.close(); - return true; - } - } - document.close(); - } catch (IOException e) { - return false; - } - - return false; - } } diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index 32beff36b..e9314fbd2 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -1,9 +1,9 @@ package com.mindee.input; import com.mindee.image.ImageCompressor; +import com.mindee.pdf.InputSourcePDFOperation; import com.mindee.pdf.PDFBoxApi; import com.mindee.pdf.PDFCompressor; -import com.mindee.pdf.PDFOperation; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -24,7 +24,7 @@ public final class LocalInputSource { @Getter private final String filename; @Setter - private PDFOperation pdfOperation; + private InputSourcePDFOperation pdfOperation; public LocalInputSource(InputStream file, String filename) throws IOException { this.file = IOUtils.toByteArray(file); @@ -57,7 +57,7 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } - public PDFOperation getPdfOperation() { + public InputSourcePDFOperation getPdfOperation() { if (this.pdfOperation == null) { this.pdfOperation = new PDFBoxApi(); } @@ -90,11 +90,11 @@ public void applyPageOptions(PageOptions pageOptions) throws IOException { } public boolean isPdf() { - return InputSourceUtils.isPdf(this.file); + return getPdfOperation().isPdf(this.file); } public boolean hasSourceText() { - return InputSourceUtils.hasSourceText(this.file); + return getPdfOperation().hasSourceText(this.file); } public void compress( diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index f9b6b21a8..8cb2f7070 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -157,7 +157,7 @@ private static byte[] createPdfFromExistingPdf( */ public static byte[] mergePdfPages(File file, List pageNumbers) throws IOException { PDDocument document = Loader.loadPDF(file); - return createPdfFromExistingPdf(document, pageNumbers, true); + return mergePdfPages(document, pageNumbers, true); } public static byte[] mergePdfPages( diff --git a/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java b/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java new file mode 100644 index 000000000..022174dfa --- /dev/null +++ b/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java @@ -0,0 +1,26 @@ +package com.mindee.pdf; + +import com.mindee.input.LocalInputSource; + +import java.io.IOException; +import java.util.List; + +public interface ExtractionPDFOperation { +// /** +// * Render a single page of a PDF as an image. +// */ +// PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; +// +// default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { +// return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); +// } + + /** + * Render all pages of a PDF as images. + */ + List pdfToImages(byte[] fileBytes, String filename) throws IOException; + + default List pdfToImages(LocalInputSource source) throws IOException { + return pdfToImages(source.getFile(), source.getFilename()); + } +} diff --git a/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java b/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java new file mode 100644 index 000000000..9c83a7eed --- /dev/null +++ b/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java @@ -0,0 +1,37 @@ +package com.mindee.pdf; + +import com.mindee.MindeeException; +import com.mindee.input.LocalInputSource; +import com.mindee.input.PageOptions; +import java.io.IOException; + +public interface InputSourcePDFOperation { + + /** + * Split a PDF file. + */ + SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; + + /** + * Get the number of pages in a PDF file. + */ + int getNumberOfPages(byte[] fileBytes) throws IOException; + + default int getNumberOfPages(LocalInputSource inputSource) throws IOException { + return getNumberOfPages(inputSource.getFile()); + } + + /** + * Returns true if the file is a PDF. + */ + boolean isPdf(byte[] fileBytes); + + /** + * Returns true if the source PDF has source text inside. Returns false for images. + * + * @param fileBytes A byte array representing a PDF. + * @return True if at least one character exists in one page. + * @throws MindeeException if the file could not be read. + */ + boolean hasSourceText(byte[] fileBytes); +} diff --git a/src/main/java/com/mindee/pdf/PDFBoxApi.java b/src/main/java/com/mindee/pdf/PDFBoxApi.java index 6afcb8c69..fe41fc23c 100644 --- a/src/main/java/com/mindee/pdf/PDFBoxApi.java +++ b/src/main/java/com/mindee/pdf/PDFBoxApi.java @@ -3,6 +3,7 @@ import com.mindee.MindeeException; import com.mindee.input.PageOptions; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; @@ -13,15 +14,17 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; /** * Allows performing various operations on PDFs. */ -public final class PDFBoxApi implements PDFOperation { +public final class PDFBoxApi implements InputSourcePDFOperation { @Override public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { @@ -61,33 +64,76 @@ public int getNumberOfPages(byte[] fileBytes) throws IOException { return pageCount; } + /** + * Returns true if the file is a PDF. + */ @Override - public PdfPageImage pdfPageToImage( - byte[] fileBytes, - String filename, - int pageNumber - ) throws IOException { - int index = pageNumber - 1; - PDDocument document = Loader.loadPDF(fileBytes); - var pdfRenderer = new PDFRenderer(document); - BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); - document.close(); - return new PdfPageImage(imageBuffer, index, filename, "jpg"); + public boolean isPdf(byte[] fileBytes) { + try { + Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); + } catch (IOException e) { + return false; + } + return true; } + /** + * Returns true if the source PDF has source text inside. Returns false for images. + * + * @param fileBytes A byte array representing a PDF. + * @return True if at least one character exists in one page. + * @throws MindeeException if the file could not be read. + */ @Override - public List pdfToImages(byte[] fileBytes, String filename) throws IOException { - PDDocument document = Loader.loadPDF(fileBytes); - var pdfRenderer = new PDFRenderer(document); - List pdfPageImages = new ArrayList<>(); - for (int i = 0; i < document.getNumberOfPages(); i++) { - var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); - pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); + public boolean hasSourceText(byte[] fileBytes) { + try { + PDDocument document = Loader + .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); + PDFTextStripper stripper = new PDFTextStripper(); + + for (int i = 0; i < document.getNumberOfPages(); i++) { + stripper.setStartPage(i + 1); + stripper.setEndPage(i + 1); + String pageText = stripper.getText(document); + if (!pageText.trim().isEmpty()) { + document.close(); + return true; + } + } + document.close(); + } catch (IOException e) { + return false; } - document.close(); - return pdfPageImages; + return false; } +// @Override +// public PdfPageImage pdfPageToImage( +// byte[] fileBytes, +// String filename, +// int pageNumber +// ) throws IOException { +// int index = pageNumber - 1; +// PDDocument document = Loader.loadPDF(fileBytes); +// var pdfRenderer = new PDFRenderer(document); +// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); +// document.close(); +// return new PdfPageImage(imageBuffer, index, filename, "jpg"); +// } + +// @Override +// public List pdfToImages(byte[] fileBytes, String filename) throws IOException { +// PDDocument document = Loader.loadPDF(fileBytes); +// var pdfRenderer = new PDFRenderer(document); +// List pdfPageImages = new ArrayList<>(); +// for (int i = 0; i < document.getNumberOfPages(); i++) { +// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); +// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); +// } +// document.close(); +// return pdfPageImages; +// } + private BufferedImage pdfPageToImageBuffer( int index, PDDocument document, @@ -128,10 +174,10 @@ private List getPageRanges(PageOptions pageOptions, Integer numberOfPag } } - private boolean checkPdfOpen(byte[] documentFile) { + private boolean checkPdfOpen(byte[] fileBytes) { boolean opens = false; try { - Loader.loadPDF(documentFile).close(); + Loader.loadPDF(fileBytes).close(); opens = true; } catch (IOException e) { e.printStackTrace(); diff --git a/src/main/java/com/mindee/pdf/PDFOperation.java b/src/main/java/com/mindee/pdf/PDFOperation.java deleted file mode 100644 index e9c0af58a..000000000 --- a/src/main/java/com/mindee/pdf/PDFOperation.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.LocalInputSource; -import com.mindee.input.PageOptions; -import java.io.IOException; -import java.util.List; - -/** - * Minimum PDF operations. - */ -public interface PDFOperation { - - /** - * Split a PDF file. - */ - SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; - - /** - * Get the number of pages in a PDF file. - */ - int getNumberOfPages(byte[] fileBytes) throws IOException; - - default int getNumberOfPages(LocalInputSource inputSource) throws IOException { - return getNumberOfPages(inputSource.getFile()); - } - - /** - * Render a single page of a PDF as an image. - */ - PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; - - default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { - return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); - } - - /** - * Render all pages of a PDF as images. - */ - List pdfToImages(byte[] fileBytes, String filename) throws IOException; - - default List pdfToImages(LocalInputSource source) throws IOException { - return pdfToImages(source.getFile(), source.getFilename()); - } -} diff --git a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java index 9cb08277c..62ff58150 100644 --- a/src/test/java/com/mindee/pdf/PDFOperationTest.java +++ b/src/test/java/com/mindee/pdf/PDFOperationTest.java @@ -9,7 +9,6 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -20,40 +19,40 @@ public class PDFOperationTest { - private final PDFOperation pdfOperation = new PDFBoxApi(); - - @Test - public void shouldConvertSinglePageToJpg() throws IOException { - LocalInputSource source = new LocalInputSource( - "src/test/resources/file_types/pdf/multipage.pdf" - ); - PdfPageImage pdfPageImage = pdfOperation.pdfPageToImage(source, 3); - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( - Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } - - @Test - public void shouldConvertAllPagesToJpg() throws IOException { - LocalInputSource source = new LocalInputSource( - "src/test/resources/file_types/pdf/multipage.pdf" - ); - List pdfPageImages = pdfOperation.pdfToImages(source); - for (PdfPageImage pdfPageImage : pdfPageImages) { - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions - .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( - Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } - } + private final InputSourcePDFOperation pdfOperation = new PDFBoxApi(); + +// @Test +// public void shouldConvertSinglePageToJpg() throws IOException { +// LocalInputSource source = new LocalInputSource( +// "src/test/resources/file_types/pdf/multipage.pdf" +// ); +// PdfPageImage pdfPageImage = pdfOperation.pdfPageToImage(source, 3); +// Assertions.assertNotNull(pdfPageImage.getImage()); +// Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); +// pdfPageImage.writeToFile("src/test/resources/output/"); +// Assertions +// .assertTrue( +// Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) +// ); +// } + +// @Test +// public void shouldConvertAllPagesToJpg() throws IOException { +// LocalInputSource source = new LocalInputSource( +// "src/test/resources/file_types/pdf/multipage.pdf" +// ); +// List pdfPageImages = pdfOperation.pdfToImages(source); +// for (PdfPageImage pdfPageImage : pdfPageImages) { +// Assertions.assertNotNull(pdfPageImage.getImage()); +// Assertions +// .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); +// pdfPageImage.writeToFile("src/test/resources/output/"); +// Assertions +// .assertTrue( +// Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) +// ); +// } +// } @Test public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws IOException { From 4dc08e0892353328f3aa859d44a78e19ff2a6a8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ianar=C3=A9=20s=C3=A9vi?= Date: Tue, 28 Apr 2026 15:15:27 +0200 Subject: [PATCH 2/4] something something dark side --- .../java/com/mindee/image/ImageExtractor.java | 47 ++++++++++---- .../com/mindee/input/LocalInputSource.java | 58 ++++++++++++----- .../mindee/pdf/ExtractionPDFOperation.java | 26 -------- .../java/com/mindee/pdf/PDFCompression.java | 28 ++++++++ .../java/com/mindee/pdf/PDFCompressor.java | 34 ++++------ .../java/com/mindee/pdf/PDFExtraction.java | 38 +++++++++++ ...DFExtractor.java => PDFExtractorBase.java} | 64 ++++++++++++++++--- ...ePDFOperation.java => PDFInputSource.java} | 10 ++- .../{PDFBoxApi.java => PDFInputSourcer.java} | 29 +-------- src/main/java/com/mindee/v1/MindeeClient.java | 23 +++---- .../java/com/mindee/v1/pdf/PDFExtractor.java | 4 +- .../com/mindee/input/FileCompressionTest.java | 20 +++--- .../java/com/mindee/pdf/PDFOperationTest.java | 2 +- 13 files changed, 239 insertions(+), 144 deletions(-) delete mode 100644 src/main/java/com/mindee/pdf/ExtractionPDFOperation.java create mode 100644 src/main/java/com/mindee/pdf/PDFCompression.java create mode 100644 src/main/java/com/mindee/pdf/PDFExtraction.java rename src/main/java/com/mindee/pdf/{BasePDFExtractor.java => PDFExtractorBase.java} (71%) rename src/main/java/com/mindee/pdf/{InputSourcePDFOperation.java => PDFInputSource.java} (80%) rename src/main/java/com/mindee/pdf/{PDFBoxApi.java => PDFInputSourcer.java} (81%) diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index aca307b6c..077bc6e0c 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -4,8 +4,6 @@ import com.mindee.geometry.PositionDataField; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; -import com.mindee.pdf.PDFBoxApi; -import com.mindee.pdf.ExtractionPDFOperation; import com.mindee.pdf.PdfPageImage; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; @@ -13,6 +11,11 @@ import java.util.ArrayList; import java.util.List; import javax.imageio.ImageIO; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; /** * Extract sub-images from an image. @@ -22,13 +25,13 @@ public class ImageExtractor { private final String filename; private final String saveFormat; - public ImageExtractor(LocalInputSource source, ExtractionPDFOperation pdfOperation) throws IOException { + public ImageExtractor(LocalInputSource source) throws IOException { this.filename = source.getFilename(); this.pageImages = new ArrayList<>(); if (source.isPdf()) { this.saveFormat = "jpg"; - var pdfPageImages = pdfOperation.pdfToImages(source); + var pdfPageImages = pdfToImages(source.getFile(), this.filename); for (PdfPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } @@ -41,14 +44,34 @@ public ImageExtractor(LocalInputSource source, ExtractionPDFOperation pdfOperati } } - /** - * Init from a {@link LocalInputSource}. - * - * @param source The local source. - * @throws IOException Throws if the file can't be accessed. - */ - public ImageExtractor(LocalInputSource source) throws IOException { - this(source, new PDFBoxApi()); + public List pdfToImages(byte[] fileBytes, String filename) throws IOException { + PDDocument document = Loader.loadPDF(fileBytes); + var pdfRenderer = new PDFRenderer(document); + List pdfPageImages = new ArrayList<>(); + for (int i = 0; i < document.getNumberOfPages(); i++) { + var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); + pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); + } + document.close(); + return pdfPageImages; + } + + private BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); } /** diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index e9314fbd2..06a53481a 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -1,9 +1,10 @@ package com.mindee.input; import com.mindee.image.ImageCompressor; -import com.mindee.pdf.InputSourcePDFOperation; -import com.mindee.pdf.PDFBoxApi; +import com.mindee.pdf.PDFCompression; import com.mindee.pdf.PDFCompressor; +import com.mindee.pdf.PDFInputSource; +import com.mindee.pdf.PDFInputSourcer; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -24,7 +25,9 @@ public final class LocalInputSource { @Getter private final String filename; @Setter - private InputSourcePDFOperation pdfOperation; + private PDFInputSource pdfOperation; + @Setter + private PDFCompressor pdfCompressor; public LocalInputSource(InputStream file, String filename) throws IOException { this.file = IOUtils.toByteArray(file); @@ -57,13 +60,20 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } - public InputSourcePDFOperation getPdfOperation() { + public PDFInputSource getPdfOperation() { if (this.pdfOperation == null) { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputSourcer(); } return this.pdfOperation; } + public PDFCompression getPdfCompressor() { + if (this.pdfCompressor == null) { + this.pdfCompressor = new PDFCompressor(); + } + return this.pdfCompressor; + } + /** * Get the number of pages in the document. * @@ -97,7 +107,7 @@ public boolean hasSourceText() { return getPdfOperation().hasSourceText(this.file); } - public void compress( + public LocalInputSource compress( Integer quality, Integer maxWidth, Integer maxHeight, @@ -105,34 +115,48 @@ public void compress( Boolean disableSourceText ) throws IOException { if (isPdf()) { - this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText); + this.file = getPdfCompressor() + .compressPdf(this.file, quality, forceSourceText, disableSourceText); } else { this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight); } + return this; } - public void compress( + public LocalInputSource compress( Integer quality, Integer maxWidth, Integer maxHeight, Boolean forceSourceText ) throws IOException { - this.compress(quality, maxWidth, maxHeight, forceSourceText, true); + return this.compress(quality, maxWidth, maxHeight, forceSourceText, true); + } + + public LocalInputSource compress( + int quality, + boolean forceSourceText, + boolean disableSourceText + ) throws IOException { + return this.compress(quality, null, null, forceSourceText, disableSourceText); } - public void compress(Integer quality, Integer maxWidth, Integer maxHeight) throws IOException { - this.compress(quality, maxWidth, maxHeight, false, true); + public LocalInputSource compress( + Integer quality, + Integer maxWidth, + Integer maxHeight + ) throws IOException { + return this.compress(quality, maxWidth, maxHeight, false, true); } - public void compress(Integer quality, Integer maxWidth) throws IOException { - this.compress(quality, maxWidth, null, false, true); + public LocalInputSource compress(Integer quality, Integer maxWidth) throws IOException { + return this.compress(quality, maxWidth, null, false, true); } - public void compress(Integer quality) throws IOException { - this.compress(quality, null, null, false, true); + public LocalInputSource compress(Integer quality) throws IOException { + return this.compress(quality, null, null, false, true); } - public void compress() throws IOException { - this.compress(85, null, null, false, true); + public LocalInputSource compress() throws IOException { + return this.compress(85, null, null, false, true); } } diff --git a/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java b/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java deleted file mode 100644 index 022174dfa..000000000 --- a/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.LocalInputSource; - -import java.io.IOException; -import java.util.List; - -public interface ExtractionPDFOperation { -// /** -// * Render a single page of a PDF as an image. -// */ -// PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; -// -// default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { -// return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); -// } - - /** - * Render all pages of a PDF as images. - */ - List pdfToImages(byte[] fileBytes, String filename) throws IOException; - - default List pdfToImages(LocalInputSource source) throws IOException { - return pdfToImages(source.getFile(), source.getFilename()); - } -} diff --git a/src/main/java/com/mindee/pdf/PDFCompression.java b/src/main/java/com/mindee/pdf/PDFCompression.java new file mode 100644 index 000000000..7d8f6e2e5 --- /dev/null +++ b/src/main/java/com/mindee/pdf/PDFCompression.java @@ -0,0 +1,28 @@ +package com.mindee.pdf; + +import java.io.IOException; + +public interface PDFCompression { + byte[] compressPdf( + byte[] pdfData, + Integer imageQuality, + Boolean forceSourceTextCompression, + Boolean disableSourceText + ) throws IOException; + + default byte[] compressPdf( + byte[] pdfData, + Integer imageQuality, + Boolean forceSourceTextCompression + ) throws IOException { + return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true); + } + + default byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException { + return compressPdf(pdfData, imageQuality, false, true); + } + + default byte[] compressPdf(byte[] pdfData) throws IOException { + return compressPdf(pdfData, 85, false, true); + } +} diff --git a/src/main/java/com/mindee/pdf/PDFCompressor.java b/src/main/java/com/mindee/pdf/PDFCompressor.java index ccf498eeb..65c88d99c 100644 --- a/src/main/java/com/mindee/pdf/PDFCompressor.java +++ b/src/main/java/com/mindee/pdf/PDFCompressor.java @@ -1,8 +1,5 @@ package com.mindee.pdf; -import static com.mindee.input.InputSourceUtils.hasSourceText; -import static com.mindee.input.InputSourceUtils.isPdf; - import java.awt.*; import java.awt.image.BufferedImage; import java.io.ByteArrayOutputStream; @@ -26,14 +23,21 @@ /** * PDF compression class. */ -public class PDFCompressor { - public static byte[] compressPdf( +public class PDFCompressor implements PDFCompression { + PDFInputSourcer pdfInputSourcer; + + public PDFCompressor() { + pdfInputSourcer = new PDFInputSourcer(); + } + + @Override + public byte[] compressPdf( byte[] pdfData, Integer imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException { - if (!isPdf(pdfData)) { + if (!pdfInputSourcer.isPdf(pdfData)) { return pdfData; } @@ -43,7 +47,7 @@ public static byte[] compressPdf( if (disableSourceText == null) { disableSourceText = true; } - if (!forceSourceTextCompression && hasSourceText(pdfData)) { + if (!forceSourceTextCompression && pdfInputSourcer.hasSourceText(pdfData)) { System.out .println( "MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted." @@ -75,22 +79,6 @@ public static byte[] compressPdf( } } - public static byte[] compressPdf( - byte[] pdfData, - Integer imageQuality, - Boolean forceSourceTextCompression - ) throws IOException { - return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true); - } - - public static byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException { - return compressPdf(pdfData, imageQuality, false, true); - } - - public static byte[] compressPdf(byte[] pdfData) throws IOException { - return compressPdf(pdfData, 85, false, true); - } - private static byte[] documentToBytes(PDDocument document) throws IOException { var outputStream = new ByteArrayOutputStream(); document.save(outputStream); diff --git a/src/main/java/com/mindee/pdf/PDFExtraction.java b/src/main/java/com/mindee/pdf/PDFExtraction.java new file mode 100644 index 000000000..542540d94 --- /dev/null +++ b/src/main/java/com/mindee/pdf/PDFExtraction.java @@ -0,0 +1,38 @@ +package com.mindee.pdf; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.pdfbox.pdmodel.PDDocument; + +public interface PDFExtraction { +// /** +// * Render a single page of a PDF as an image. +// */ +// PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; +// +// default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { +// return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); +// } + +// /** +// * Render all pages of a PDF as images. +// */ +// List pdfToImages(byte[] fileBytes, String filename) throws IOException; +// +// default List pdfToImages(LocalInputSource source) throws IOException { +// return pdfToImages(source.getFile(), source.getFilename()); +// } + + public byte[] mergePdfPages(File file, List pageNumbers) throws IOException; + + default byte[] mergePdfPages(PDDocument document, List pageNumbers) throws IOException { + return mergePdfPages(document, pageNumbers, true); + } + + public byte[] mergePdfPages( + PDDocument document, + List pageNumbers, + boolean closeOriginal + ) throws IOException; +} diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/PDFExtractorBase.java similarity index 71% rename from src/main/java/com/mindee/pdf/BasePDFExtractor.java rename to src/main/java/com/mindee/pdf/PDFExtractorBase.java index 8cb2f7070..e8258b213 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/PDFExtractorBase.java @@ -17,13 +17,16 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; /** * PDF extraction class. */ -public class BasePDFExtractor { +public class PDFExtractorBase implements PDFExtraction { protected final PDDocument sourcePdf; protected final String filename; @@ -33,7 +36,7 @@ public class BasePDFExtractor { * @param source The local source. * @throws IOException Throws if the file can't be accessed. */ - protected BasePDFExtractor(LocalInputSource source) throws IOException { + public PDFExtractorBase(LocalInputSource source) throws IOException { this.filename = source.getFilename(); if (source.isPdf()) { this.sourcePdf = Loader.loadPDF(source.getFile()); @@ -57,6 +60,51 @@ protected BasePDFExtractor(LocalInputSource source) throws IOException { } } +// @Override +// public PdfPageImage pdfPageToImage( +// byte[] fileBytes, +// String filename, +// int pageNumber +// ) throws IOException { +// int index = pageNumber - 1; +// PDDocument document = Loader.loadPDF(fileBytes); +// var pdfRenderer = new PDFRenderer(document); +// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); +// document.close(); +// return new PdfPageImage(imageBuffer, index, filename, "jpg"); +// } +// +// @Override +// public List pdfToImages(byte[] fileBytes, String filename) throws IOException { +// PDDocument document = Loader.loadPDF(fileBytes); +// var pdfRenderer = new PDFRenderer(document); +// List pdfPageImages = new ArrayList<>(); +// for (int i = 0; i < document.getNumberOfPages(); i++) { +// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); +// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); +// } +// document.close(); +// return pdfPageImages; +// } + + private BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + } + /** * Get the number of pages in the PDF file. * @@ -155,19 +203,19 @@ private static byte[] createPdfFromExistingPdf( * @param file The PDF file. * @param pageNumbers Lit of page numbers to merge together. */ - public static byte[] mergePdfPages(File file, List pageNumbers) throws IOException { + @Override + public byte[] mergePdfPages(File file, List pageNumbers) throws IOException { PDDocument document = Loader.loadPDF(file); return mergePdfPages(document, pageNumbers, true); } - public static byte[] mergePdfPages( - PDDocument document, - List pageNumbers - ) throws IOException { + @Override + public byte[] mergePdfPages(PDDocument document, List pageNumbers) throws IOException { return mergePdfPages(document, pageNumbers, true); } - public static byte[] mergePdfPages( + @Override + public byte[] mergePdfPages( PDDocument document, List pageNumbers, boolean closeOriginal diff --git a/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java b/src/main/java/com/mindee/pdf/PDFInputSource.java similarity index 80% rename from src/main/java/com/mindee/pdf/InputSourcePDFOperation.java rename to src/main/java/com/mindee/pdf/PDFInputSource.java index 9c83a7eed..c3b0eedef 100644 --- a/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java +++ b/src/main/java/com/mindee/pdf/PDFInputSource.java @@ -1,19 +1,22 @@ package com.mindee.pdf; -import com.mindee.MindeeException; import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; import java.io.IOException; -public interface InputSourcePDFOperation { +public interface PDFInputSource { /** * Split a PDF file. + * + * @param fileBytes A byte array representing a PDF. */ SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; /** * Get the number of pages in a PDF file. + * + * @param fileBytes A byte array representing a PDF. */ int getNumberOfPages(byte[] fileBytes) throws IOException; @@ -23,6 +26,8 @@ default int getNumberOfPages(LocalInputSource inputSource) throws IOException { /** * Returns true if the file is a PDF. + * + * @param fileBytes A byte array representing a PDF. */ boolean isPdf(byte[] fileBytes); @@ -31,7 +36,6 @@ default int getNumberOfPages(LocalInputSource inputSource) throws IOException { * * @param fileBytes A byte array representing a PDF. * @return True if at least one character exists in one page. - * @throws MindeeException if the file could not be read. */ boolean hasSourceText(byte[] fileBytes); } diff --git a/src/main/java/com/mindee/pdf/PDFBoxApi.java b/src/main/java/com/mindee/pdf/PDFInputSourcer.java similarity index 81% rename from src/main/java/com/mindee/pdf/PDFBoxApi.java rename to src/main/java/com/mindee/pdf/PDFInputSourcer.java index fe41fc23c..f23ba4d61 100644 --- a/src/main/java/com/mindee/pdf/PDFBoxApi.java +++ b/src/main/java/com/mindee/pdf/PDFInputSourcer.java @@ -24,7 +24,7 @@ /** * Allows performing various operations on PDFs. */ -public final class PDFBoxApi implements InputSourcePDFOperation { +public final class PDFInputSourcer implements PDFInputSource { @Override public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { @@ -107,33 +107,6 @@ public boolean hasSourceText(byte[] fileBytes) { return false; } -// @Override -// public PdfPageImage pdfPageToImage( -// byte[] fileBytes, -// String filename, -// int pageNumber -// ) throws IOException { -// int index = pageNumber - 1; -// PDDocument document = Loader.loadPDF(fileBytes); -// var pdfRenderer = new PDFRenderer(document); -// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); -// document.close(); -// return new PdfPageImage(imageBuffer, index, filename, "jpg"); -// } - -// @Override -// public List pdfToImages(byte[] fileBytes, String filename) throws IOException { -// PDDocument document = Loader.loadPDF(fileBytes); -// var pdfRenderer = new PDFRenderer(document); -// List pdfPageImages = new ArrayList<>(); -// for (int i = 0; i < document.getNumberOfPages(); i++) { -// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); -// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); -// } -// document.close(); -// return pdfPageImages; -// } - private BufferedImage pdfPageToImageBuffer( int index, PDDocument document, diff --git a/src/main/java/com/mindee/v1/MindeeClient.java b/src/main/java/com/mindee/v1/MindeeClient.java index cd167d2b1..e80ac4822 100644 --- a/src/main/java/com/mindee/v1/MindeeClient.java +++ b/src/main/java/com/mindee/v1/MindeeClient.java @@ -4,8 +4,8 @@ import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; import com.mindee.input.URLInputSource; -import com.mindee.pdf.PDFBoxApi; -import com.mindee.pdf.PDFOperation; +import com.mindee.pdf.PDFInputSource; +import com.mindee.pdf.PDFInputSourcer; import com.mindee.v1.clientOptions.PollingOptions; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.clientOptions.WorkflowOptions; @@ -26,7 +26,7 @@ */ public class MindeeClient { - protected PDFOperation pdfOperation; + protected PDFInputSource pdfOperation; private final MindeeApiV1 mindeeApi; /** @@ -34,7 +34,7 @@ public class MindeeClient { * You'll need to set the API key in the environment for this approach to work properly. */ public MindeeClient() { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputSourcer(); this.mindeeApi = createDefaultApi(""); } @@ -44,7 +44,7 @@ public MindeeClient() { * @param apiKey The api key to use. */ public MindeeClient(String apiKey) { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputSourcer(); this.mindeeApi = createDefaultApi(apiKey); } @@ -54,7 +54,7 @@ public MindeeClient(String apiKey) { * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ public MindeeClient(MindeeApiV1 mindeeApi) { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputSourcer(); this.mindeeApi = mindeeApi; } @@ -64,7 +64,7 @@ public MindeeClient(MindeeApiV1 mindeeApi) { * @param pdfOperation The PdfOperation implementation to be used by the created MindeeClient. * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ - public MindeeClient(PDFOperation pdfOperation, MindeeApiV1 mindeeApi) { + public MindeeClient(PDFInputSource pdfOperation, MindeeApiV1 mindeeApi) { this.pdfOperation = pdfOperation; this.mindeeApi = mindeeApi; } @@ -127,13 +127,8 @@ protected byte[] getSplitFile( LocalInputSource localInputSource, PageOptions pageOptions ) throws IOException { - byte[] splitFile; - if (pageOptions == null || !localInputSource.isPdf()) { - splitFile = localInputSource.getFile(); - } else { - splitFile = pdfOperation.split(localInputSource.getFile(), pageOptions).getFile(); - } - return splitFile; + localInputSource.applyPageOptions(pageOptions); + return localInputSource.getFile(); } /** diff --git a/src/main/java/com/mindee/v1/pdf/PDFExtractor.java b/src/main/java/com/mindee/v1/pdf/PDFExtractor.java index e22460650..405e5049f 100644 --- a/src/main/java/com/mindee/v1/pdf/PDFExtractor.java +++ b/src/main/java/com/mindee/v1/pdf/PDFExtractor.java @@ -1,8 +1,8 @@ package com.mindee.v1.pdf; import com.mindee.input.LocalInputSource; -import com.mindee.pdf.BasePDFExtractor; import com.mindee.pdf.ExtractedPDF; +import com.mindee.pdf.PDFExtractorBase; import com.mindee.v1.product.invoicesplitter.InvoiceSplitterV1InvoicePageGroup; import java.io.IOException; import java.util.ArrayList; @@ -12,7 +12,7 @@ /** * PDF extraction class. */ -public class PDFExtractor extends BasePDFExtractor { +public class PDFExtractor extends PDFExtractorBase { /** * Init from a {@link LocalInputSource}. diff --git a/src/test/java/com/mindee/input/FileCompressionTest.java b/src/test/java/com/mindee/input/FileCompressionTest.java index 50b2677cf..5235edef9 100644 --- a/src/test/java/com/mindee/input/FileCompressionTest.java +++ b/src/test/java/com/mindee/input/FileCompressionTest.java @@ -14,7 +14,6 @@ import java.util.stream.Collectors; import javax.imageio.ImageIO; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -251,13 +250,14 @@ public void testPdfResizeFromCompressor() throws IOException { Path outputDir = getResourcePath("output"); Path inputPath = getV1ResourcePath("products/invoice_splitter/default_sample.pdf"); LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); - + pdfResizeInput.compress(); + var compressor = new PDFCompressor(); List resizes = Arrays .asList( - PDFCompressor.compressPdf(pdfResizeInput.getFile()), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 75), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 50), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 10) + compressor.compressPdf(pdfResizeInput.getFile()), + compressor.compressPdf(pdfResizeInput.getFile(), 75), + compressor.compressPdf(pdfResizeInput.getFile(), 50), + compressor.compressPdf(pdfResizeInput.getFile(), 10) ); List outputPaths = Arrays @@ -323,11 +323,11 @@ public void testPdfResizeFromCompressor() throws IOException { public void testPdfResizeWithTextKeepsText() throws IOException { Path inputPath = getResourcePath("file_types/pdf/multipage.pdf"); LocalInputSource initialWithText = new LocalInputSource(inputPath.toString()); - byte[] compressedWithText = PDFCompressor - .compressPdf(initialWithText.getFile(), 100, true, false); - PDDocument originalDoc = Loader.loadPDF(initialWithText.getFile()); - PDDocument compressedDoc = Loader.loadPDF(compressedWithText); + var originalDoc = Loader.loadPDF(initialWithText.getFile()); + + var compressedWithText = initialWithText.compress(100, true, false).getFile(); + var compressedDoc = Loader.loadPDF(compressedWithText); Assertions.assertEquals(originalDoc.getNumberOfPages(), compressedDoc.getNumberOfPages()); Assertions.assertNotEquals(originalDoc.hashCode(), compressedDoc.hashCode()); diff --git a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java index 62ff58150..8cf07c4d2 100644 --- a/src/test/java/com/mindee/pdf/PDFOperationTest.java +++ b/src/test/java/com/mindee/pdf/PDFOperationTest.java @@ -19,7 +19,7 @@ public class PDFOperationTest { - private final InputSourcePDFOperation pdfOperation = new PDFBoxApi(); + private final PDFInputSource pdfOperation = new PDFInputSourcer(); // @Test // public void shouldConvertSinglePageToJpg() throws IOException { From 08585d6565fccee716447675387674fc434e99b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ianar=C3=A9=20s=C3=A9vi?= Date: Tue, 28 Apr 2026 17:58:52 +0200 Subject: [PATCH 3/4] will it work? --- .../java/com/mindee/image/ImageExtractor.java | 2 +- .../com/mindee/input/LocalInputSource.java | 48 +++++----- ...tractorBase.java => BasePDFExtractor.java} | 87 +------------------ .../java/com/mindee/pdf/ExtractedPDF.java | 19 ++-- .../java/com/mindee/pdf/PDFCompression.java | 18 ++-- .../java/com/mindee/pdf/PDFCompressor.java | 54 +++++++++--- .../java/com/mindee/pdf/PDFExtraction.java | 38 -------- .../com/mindee/pdf/PDFInputOperation.java | 28 ++++++ ...nputSourcer.java => PDFInputOperator.java} | 67 ++------------ .../java/com/mindee/pdf/PDFInputSource.java | 41 --------- src/main/java/com/mindee/v1/MindeeClient.java | 14 +-- .../java/com/mindee/v1/pdf/PDFExtractor.java | 4 +- .../com/mindee/input/FileCompressionTest.java | 8 +- .../mindee/input/LocalInputSourceTest.java | 9 +- .../java/com/mindee/pdf/PDFOperationTest.java | 39 +-------- .../InvoiceSplitterAutoExtractionIT.java | 1 - .../com/mindee/v1/pdf/PDFExtractorTest.java | 6 +- 17 files changed, 146 insertions(+), 337 deletions(-) rename src/main/java/com/mindee/pdf/{PDFExtractorBase.java => BasePDFExtractor.java} (61%) delete mode 100644 src/main/java/com/mindee/pdf/PDFExtraction.java create mode 100644 src/main/java/com/mindee/pdf/PDFInputOperation.java rename src/main/java/com/mindee/pdf/{PDFInputSourcer.java => PDFInputOperator.java} (57%) delete mode 100644 src/main/java/com/mindee/pdf/PDFInputSource.java diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index 077bc6e0c..caf728781 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -29,7 +29,7 @@ public ImageExtractor(LocalInputSource source) throws IOException { this.filename = source.getFilename(); this.pageImages = new ArrayList<>(); - if (source.isPdf()) { + if (source.isPDF()) { this.saveFormat = "jpg"; var pdfPageImages = pdfToImages(source.getFile(), this.filename); for (PdfPageImage pdfPageImage : pdfPageImages) { diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index 06a53481a..a3b536f1a 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -3,8 +3,8 @@ import com.mindee.image.ImageCompressor; import com.mindee.pdf.PDFCompression; import com.mindee.pdf.PDFCompressor; -import com.mindee.pdf.PDFInputSource; -import com.mindee.pdf.PDFInputSourcer; +import com.mindee.pdf.PDFInputOperation; +import com.mindee.pdf.PDFInputOperator; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -18,16 +18,18 @@ /** * A source document for Mindee API operations. */ -public final class LocalInputSource { +public class LocalInputSource { @Getter private byte[] file; @Getter private final String filename; @Setter - private PDFInputSource pdfOperation; + private PDFInputOperation pdfInputOperator; @Setter - private PDFCompressor pdfCompressor; + private PDFCompression pdfCompressor; + // Store here to avoid recalculating every time. + private Boolean isPDF; public LocalInputSource(InputStream file, String filename) throws IOException { this.file = IOUtils.toByteArray(file); @@ -60,14 +62,14 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } - public PDFInputSource getPdfOperation() { - if (this.pdfOperation == null) { - this.pdfOperation = new PDFInputSourcer(); + private PDFInputOperation getPdfInputOperator() { + if (this.pdfInputOperator == null) { + this.pdfInputOperator = new PDFInputOperator(); } - return this.pdfOperation; + return this.pdfInputOperator; } - public PDFCompression getPdfCompressor() { + private PDFCompression getPdfCompressor() { if (this.pdfCompressor == null) { this.pdfCompressor = new PDFCompressor(); } @@ -81,10 +83,10 @@ public PDFCompression getPdfCompressor() { * @throws IOException If an I/O error occurs during the PDF operation. */ public int getPageCount() throws IOException { - if (!this.isPdf()) { + if (!this.isPDF()) { return 1; } - return getPdfOperation().getNumberOfPages(this.file); + return getPdfInputOperator().getPageCount(this.file); } /** @@ -94,17 +96,19 @@ public int getPageCount() throws IOException { * @throws IOException If an I/O error occurs during the PDF operation. */ public void applyPageOptions(PageOptions pageOptions) throws IOException { - if (pageOptions != null && this.isPdf()) { - this.file = getPdfOperation().split(this.file, pageOptions).getFile(); + if (pageOptions != null && this.isPDF()) { + this.file = getPdfInputOperator().split(this.file, pageOptions).getFile(); } } - public boolean isPdf() { - return getPdfOperation().isPdf(this.file); - } - - public boolean hasSourceText() { - return getPdfOperation().hasSourceText(this.file); + /** + * Returns true if the file is a PDF. + */ + public boolean isPDF() { + if (this.isPDF == null) { + this.isPDF = getPdfInputOperator().isPDF(this.file); + } + return this.isPDF; } public LocalInputSource compress( @@ -114,9 +118,9 @@ public LocalInputSource compress( Boolean forceSourceText, Boolean disableSourceText ) throws IOException { - if (isPdf()) { + if (isPDF()) { this.file = getPdfCompressor() - .compressPdf(this.file, quality, forceSourceText, disableSourceText); + .compressPDF(this.file, quality, forceSourceText, disableSourceText); } else { this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight); } diff --git a/src/main/java/com/mindee/pdf/PDFExtractorBase.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java similarity index 61% rename from src/main/java/com/mindee/pdf/PDFExtractorBase.java rename to src/main/java/com/mindee/pdf/BasePDFExtractor.java index e8258b213..d7263d803 100644 --- a/src/main/java/com/mindee/pdf/PDFExtractorBase.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -6,7 +6,6 @@ import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -17,16 +16,13 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; /** * PDF extraction class. */ -public class PDFExtractorBase implements PDFExtraction { +public class BasePDFExtractor { protected final PDDocument sourcePdf; protected final String filename; @@ -36,9 +32,9 @@ public class PDFExtractorBase implements PDFExtraction { * @param source The local source. * @throws IOException Throws if the file can't be accessed. */ - public PDFExtractorBase(LocalInputSource source) throws IOException { + public BasePDFExtractor(LocalInputSource source) throws IOException { this.filename = source.getFilename(); - if (source.isPdf()) { + if (source.isPDF()) { this.sourcePdf = Loader.loadPDF(source.getFile()); } else { var document = new PDDocument(); @@ -60,60 +56,6 @@ public PDFExtractorBase(LocalInputSource source) throws IOException { } } -// @Override -// public PdfPageImage pdfPageToImage( -// byte[] fileBytes, -// String filename, -// int pageNumber -// ) throws IOException { -// int index = pageNumber - 1; -// PDDocument document = Loader.loadPDF(fileBytes); -// var pdfRenderer = new PDFRenderer(document); -// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); -// document.close(); -// return new PdfPageImage(imageBuffer, index, filename, "jpg"); -// } -// -// @Override -// public List pdfToImages(byte[] fileBytes, String filename) throws IOException { -// PDDocument document = Loader.loadPDF(fileBytes); -// var pdfRenderer = new PDFRenderer(document); -// List pdfPageImages = new ArrayList<>(); -// for (int i = 0; i < document.getNumberOfPages(); i++) { -// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); -// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); -// } -// document.close(); -// return pdfPageImages; -// } - - private BufferedImage pdfPageToImageBuffer( - int index, - PDDocument document, - PDFRenderer pdfRenderer - ) throws IOException { - PDRectangle bbox = document.getPage(index).getBBox(); - float dimension = bbox.getWidth() * bbox.getHeight(); - int dpi; - if (dimension < 200000) { - dpi = 300; - } else if (dimension < 300000) { - dpi = 250; - } else { - dpi = 200; - } - return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); - } - - /** - * Get the number of pages in the PDF file. - * - * @return The number of pages in the PDF file. - */ - public int getPageCount() { - return sourcePdf.getNumberOfPages(); - } - /** * Converts an array to a buffered image. * @@ -154,10 +96,7 @@ public List extractSubDocuments( + splitName[1]; extractedPDFs .add( - new ExtractedPDF( - Loader.loadPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false)), - fieldFilename - ) + new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename) ); } return extractedPDFs; @@ -197,24 +136,6 @@ private static byte[] createPdfFromExistingPdf( return output; } - /** - * Merge specified PDF pages together. - * - * @param file The PDF file. - * @param pageNumbers Lit of page numbers to merge together. - */ - @Override - public byte[] mergePdfPages(File file, List pageNumbers) throws IOException { - PDDocument document = Loader.loadPDF(file); - return mergePdfPages(document, pageNumbers, true); - } - - @Override - public byte[] mergePdfPages(PDDocument document, List pageNumbers) throws IOException { - return mergePdfPages(document, pageNumbers, true); - } - - @Override public byte[] mergePdfPages( PDDocument document, List pageNumbers, diff --git a/src/main/java/com/mindee/pdf/ExtractedPDF.java b/src/main/java/com/mindee/pdf/ExtractedPDF.java index 7bfe24c95..8b9c7c256 100644 --- a/src/main/java/com/mindee/pdf/ExtractedPDF.java +++ b/src/main/java/com/mindee/pdf/ExtractedPDF.java @@ -1,29 +1,27 @@ package com.mindee.pdf; import com.mindee.input.LocalInputSource; -import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import lombok.Getter; -import org.apache.pdfbox.pdmodel.PDDocument; /** * An extracted sub-PDF. */ @Getter public class ExtractedPDF { - private final PDDocument pdf; + private final byte[] fileBytes; private final String filename; /** * Default constructor. * - * @param pdf PDF wrapper object. + * @param fileBytes PDF file as bytes. * @param filename Name of the extracted file. */ - public ExtractedPDF(PDDocument pdf, String filename) { - this.pdf = pdf; + public ExtractedPDF(byte[] fileBytes, String filename) { + this.fileBytes = fileBytes; this.filename = filename; } @@ -35,8 +33,7 @@ public ExtractedPDF(PDDocument pdf, String filename) { */ public void writeToFile(String outputPath) throws IOException { var pdfPath = Paths.get(outputPath, this.filename); - var outputfile = new File(pdfPath.toString()); - this.pdf.save(outputfile); + Files.write(pdfPath, this.fileBytes); } /** @@ -46,8 +43,6 @@ public void writeToFile(String outputPath) throws IOException { * @throws IOException Throws if the file can't be accessed. */ public LocalInputSource asInputSource() throws IOException { - var output = new ByteArrayOutputStream(); - this.pdf.save(output); - return new LocalInputSource(output.toByteArray(), this.filename); + return new LocalInputSource(this.fileBytes, this.filename); } } diff --git a/src/main/java/com/mindee/pdf/PDFCompression.java b/src/main/java/com/mindee/pdf/PDFCompression.java index 7d8f6e2e5..e8619014d 100644 --- a/src/main/java/com/mindee/pdf/PDFCompression.java +++ b/src/main/java/com/mindee/pdf/PDFCompression.java @@ -3,26 +3,26 @@ import java.io.IOException; public interface PDFCompression { - byte[] compressPdf( - byte[] pdfData, + byte[] compressPDF( + byte[] fileBytes, Integer imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException; - default byte[] compressPdf( - byte[] pdfData, + default byte[] compressPDF( + byte[] fileBytes, Integer imageQuality, Boolean forceSourceTextCompression ) throws IOException { - return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true); + return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true); } - default byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException { - return compressPdf(pdfData, imageQuality, false, true); + default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException { + return compressPDF(fileBytes, imageQuality, false, true); } - default byte[] compressPdf(byte[] pdfData) throws IOException { - return compressPdf(pdfData, 85, false, true); + default byte[] compressPDF(byte[] fileBytes) throws IOException { + return compressPDF(fileBytes, 85, false, true); } } diff --git a/src/main/java/com/mindee/pdf/PDFCompressor.java b/src/main/java/com/mindee/pdf/PDFCompressor.java index 65c88d99c..0932f1973 100644 --- a/src/main/java/com/mindee/pdf/PDFCompressor.java +++ b/src/main/java/com/mindee/pdf/PDFCompressor.java @@ -1,11 +1,14 @@ package com.mindee.pdf; +import com.mindee.MindeeException; import java.awt.*; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.List; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -24,21 +27,21 @@ * PDF compression class. */ public class PDFCompressor implements PDFCompression { - PDFInputSourcer pdfInputSourcer; + private final PDFInputOperator pdfInputOperator; public PDFCompressor() { - pdfInputSourcer = new PDFInputSourcer(); + this.pdfInputOperator = new PDFInputOperator(); } @Override - public byte[] compressPdf( - byte[] pdfData, + public byte[] compressPDF( + byte[] fileBytes, Integer imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException { - if (!pdfInputSourcer.isPdf(pdfData)) { - return pdfData; + if (!pdfInputOperator.isPDF(fileBytes)) { + return fileBytes; } if (forceSourceTextCompression == null) { @@ -47,14 +50,14 @@ public byte[] compressPdf( if (disableSourceText == null) { disableSourceText = true; } - if (!forceSourceTextCompression && pdfInputSourcer.hasSourceText(pdfData)) { + if (!forceSourceTextCompression && hasSourceText(fileBytes)) { System.out .println( "MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted." ); - return pdfData; + return fileBytes; } - try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) { + try (var inputDoc = Loader.loadPDF(fileBytes); PDDocument outputDoc = new PDDocument()) { var pdfRenderer = new PDFRenderer(inputDoc); @@ -79,6 +82,35 @@ public byte[] compressPdf( } } + /** + * Returns true if the source PDF has source text inside. Returns false for images. + * + * @param fileBytes A byte array representing a PDF. + * @return True if at least one character exists in one page. + * @throws MindeeException if the file could not be read. + */ + private boolean hasSourceText(byte[] fileBytes) { + try { + PDDocument document = Loader + .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); + var stripper = new PDFTextStripper(); + + for (int i = 0; i < document.getNumberOfPages(); i++) { + stripper.setStartPage(i + 1); + stripper.setEndPage(i + 1); + String pageText = stripper.getText(document); + if (!pageText.trim().isEmpty()) { + document.close(); + return true; + } + } + document.close(); + } catch (IOException e) { + return false; + } + return false; + } + private static byte[] documentToBytes(PDDocument document) throws IOException { var outputStream = new ByteArrayOutputStream(); document.save(outputStream); @@ -122,9 +154,9 @@ protected void writeString(String text, List textPositions) throws return; } - TextPosition firstPosition = textPositions.get(0); + var firstPosition = textPositions.get(0); float fontSize = firstPosition.getFontSizeInPt(); - PDColor color = getGraphicsState().getNonStrokingColor(); + var color = getGraphicsState().getNonStrokingColor(); contentStream.beginText(); contentStream.setFont(firstPosition.getFont(), fontSize); contentStream.setNonStrokingColor(convertToAwtColor(color)); diff --git a/src/main/java/com/mindee/pdf/PDFExtraction.java b/src/main/java/com/mindee/pdf/PDFExtraction.java deleted file mode 100644 index 542540d94..000000000 --- a/src/main/java/com/mindee/pdf/PDFExtraction.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.mindee.pdf; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import org.apache.pdfbox.pdmodel.PDDocument; - -public interface PDFExtraction { -// /** -// * Render a single page of a PDF as an image. -// */ -// PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; -// -// default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { -// return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); -// } - -// /** -// * Render all pages of a PDF as images. -// */ -// List pdfToImages(byte[] fileBytes, String filename) throws IOException; -// -// default List pdfToImages(LocalInputSource source) throws IOException { -// return pdfToImages(source.getFile(), source.getFilename()); -// } - - public byte[] mergePdfPages(File file, List pageNumbers) throws IOException; - - default byte[] mergePdfPages(PDDocument document, List pageNumbers) throws IOException { - return mergePdfPages(document, pageNumbers, true); - } - - public byte[] mergePdfPages( - PDDocument document, - List pageNumbers, - boolean closeOriginal - ) throws IOException; -} diff --git a/src/main/java/com/mindee/pdf/PDFInputOperation.java b/src/main/java/com/mindee/pdf/PDFInputOperation.java new file mode 100644 index 000000000..a69cc50af --- /dev/null +++ b/src/main/java/com/mindee/pdf/PDFInputOperation.java @@ -0,0 +1,28 @@ +package com.mindee.pdf; + +import com.mindee.input.PageOptions; +import java.io.IOException; + +public interface PDFInputOperation { + + /** + * Split a PDF file. + * + * @param fileBytes A byte array representing a PDF. + */ + SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; + + /** + * Get the number of pages in a PDF file. + * + * @param fileBytes A byte array representing a PDF. + */ + int getPageCount(byte[] fileBytes) throws IOException; + + /** + * Returns true if the file is a PDF. + * + * @param fileBytes A byte array representing a PDF. + */ + boolean isPDF(byte[] fileBytes); +} diff --git a/src/main/java/com/mindee/pdf/PDFInputSourcer.java b/src/main/java/com/mindee/pdf/PDFInputOperator.java similarity index 57% rename from src/main/java/com/mindee/pdf/PDFInputSourcer.java rename to src/main/java/com/mindee/pdf/PDFInputOperator.java index f23ba4d61..e5657fcfa 100644 --- a/src/main/java/com/mindee/pdf/PDFInputSourcer.java +++ b/src/main/java/com/mindee/pdf/PDFInputOperator.java @@ -2,7 +2,6 @@ import com.mindee.MindeeException; import com.mindee.input.PageOptions; -import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -16,26 +15,22 @@ import org.apache.pdfbox.Loader; import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; /** * Allows performing various operations on PDFs. */ -public final class PDFInputSourcer implements PDFInputSource { +public final class PDFInputOperator implements PDFInputOperation { @Override public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { - if (!checkPdfOpen(fileBytes)) { + if (!isPDFOpen(fileBytes)) { throw new MindeeException("This document cannot be open and cannot be split."); } try (var originalDocument = Loader.loadPDF(fileBytes)) { try (var splitDocument = new PDDocument()) { - int totalOriginalPages = getNumberOfPages(fileBytes); + int totalOriginalPages = getPageCount(fileBytes); if (totalOriginalPages < pageOptions.getOnMinPages()) { return new SplitPDF(fileBytes, totalOriginalPages); @@ -50,14 +45,14 @@ public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOExcept try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { splitDocument.save(outputStream); byte[] splitPdf = outputStream.toByteArray(); - return new SplitPDF(splitPdf, getNumberOfPages(splitPdf)); + return new SplitPDF(splitPdf, getPageCount(splitPdf)); } } } } @Override - public int getNumberOfPages(byte[] fileBytes) throws IOException { + public int getPageCount(byte[] fileBytes) throws IOException { var document = Loader.loadPDF(fileBytes); int pageCount = document.getNumberOfPages(); document.close(); @@ -68,7 +63,7 @@ public int getNumberOfPages(byte[] fileBytes) throws IOException { * Returns true if the file is a PDF. */ @Override - public boolean isPdf(byte[] fileBytes) { + public boolean isPDF(byte[] fileBytes) { try { Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); } catch (IOException e) { @@ -77,54 +72,6 @@ public boolean isPdf(byte[] fileBytes) { return true; } - /** - * Returns true if the source PDF has source text inside. Returns false for images. - * - * @param fileBytes A byte array representing a PDF. - * @return True if at least one character exists in one page. - * @throws MindeeException if the file could not be read. - */ - @Override - public boolean hasSourceText(byte[] fileBytes) { - try { - PDDocument document = Loader - .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); - PDFTextStripper stripper = new PDFTextStripper(); - - for (int i = 0; i < document.getNumberOfPages(); i++) { - stripper.setStartPage(i + 1); - stripper.setEndPage(i + 1); - String pageText = stripper.getText(document); - if (!pageText.trim().isEmpty()) { - document.close(); - return true; - } - } - document.close(); - } catch (IOException e) { - return false; - } - return false; - } - - private BufferedImage pdfPageToImageBuffer( - int index, - PDDocument document, - PDFRenderer pdfRenderer - ) throws IOException { - PDRectangle bbox = document.getPage(index).getBBox(); - float dimension = bbox.getWidth() * bbox.getHeight(); - int dpi; - if (dimension < 200000) { - dpi = 300; - } else if (dimension < 300000) { - dpi = 250; - } else { - dpi = 200; - } - return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); - } - private List getPageRanges(PageOptions pageOptions, Integer numberOfPages) { Set pages = Optional @@ -147,7 +94,7 @@ private List getPageRanges(PageOptions pageOptions, Integer numberOfPag } } - private boolean checkPdfOpen(byte[] fileBytes) { + private boolean isPDFOpen(byte[] fileBytes) { boolean opens = false; try { Loader.loadPDF(fileBytes).close(); diff --git a/src/main/java/com/mindee/pdf/PDFInputSource.java b/src/main/java/com/mindee/pdf/PDFInputSource.java deleted file mode 100644 index c3b0eedef..000000000 --- a/src/main/java/com/mindee/pdf/PDFInputSource.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.LocalInputSource; -import com.mindee.input.PageOptions; -import java.io.IOException; - -public interface PDFInputSource { - - /** - * Split a PDF file. - * - * @param fileBytes A byte array representing a PDF. - */ - SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; - - /** - * Get the number of pages in a PDF file. - * - * @param fileBytes A byte array representing a PDF. - */ - int getNumberOfPages(byte[] fileBytes) throws IOException; - - default int getNumberOfPages(LocalInputSource inputSource) throws IOException { - return getNumberOfPages(inputSource.getFile()); - } - - /** - * Returns true if the file is a PDF. - * - * @param fileBytes A byte array representing a PDF. - */ - boolean isPdf(byte[] fileBytes); - - /** - * Returns true if the source PDF has source text inside. Returns false for images. - * - * @param fileBytes A byte array representing a PDF. - * @return True if at least one character exists in one page. - */ - boolean hasSourceText(byte[] fileBytes); -} diff --git a/src/main/java/com/mindee/v1/MindeeClient.java b/src/main/java/com/mindee/v1/MindeeClient.java index e80ac4822..67a20cf9b 100644 --- a/src/main/java/com/mindee/v1/MindeeClient.java +++ b/src/main/java/com/mindee/v1/MindeeClient.java @@ -4,8 +4,8 @@ import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; import com.mindee.input.URLInputSource; -import com.mindee.pdf.PDFInputSource; -import com.mindee.pdf.PDFInputSourcer; +import com.mindee.pdf.PDFInputOperation; +import com.mindee.pdf.PDFInputOperator; import com.mindee.v1.clientOptions.PollingOptions; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.clientOptions.WorkflowOptions; @@ -26,7 +26,7 @@ */ public class MindeeClient { - protected PDFInputSource pdfOperation; + protected PDFInputOperation pdfOperation; private final MindeeApiV1 mindeeApi; /** @@ -34,7 +34,7 @@ public class MindeeClient { * You'll need to set the API key in the environment for this approach to work properly. */ public MindeeClient() { - this.pdfOperation = new PDFInputSourcer(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = createDefaultApi(""); } @@ -44,7 +44,7 @@ public MindeeClient() { * @param apiKey The api key to use. */ public MindeeClient(String apiKey) { - this.pdfOperation = new PDFInputSourcer(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = createDefaultApi(apiKey); } @@ -54,7 +54,7 @@ public MindeeClient(String apiKey) { * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ public MindeeClient(MindeeApiV1 mindeeApi) { - this.pdfOperation = new PDFInputSourcer(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = mindeeApi; } @@ -64,7 +64,7 @@ public MindeeClient(MindeeApiV1 mindeeApi) { * @param pdfOperation The PdfOperation implementation to be used by the created MindeeClient. * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ - public MindeeClient(PDFInputSource pdfOperation, MindeeApiV1 mindeeApi) { + public MindeeClient(PDFInputOperation pdfOperation, MindeeApiV1 mindeeApi) { this.pdfOperation = pdfOperation; this.mindeeApi = mindeeApi; } diff --git a/src/main/java/com/mindee/v1/pdf/PDFExtractor.java b/src/main/java/com/mindee/v1/pdf/PDFExtractor.java index 405e5049f..e22460650 100644 --- a/src/main/java/com/mindee/v1/pdf/PDFExtractor.java +++ b/src/main/java/com/mindee/v1/pdf/PDFExtractor.java @@ -1,8 +1,8 @@ package com.mindee.v1.pdf; import com.mindee.input.LocalInputSource; +import com.mindee.pdf.BasePDFExtractor; import com.mindee.pdf.ExtractedPDF; -import com.mindee.pdf.PDFExtractorBase; import com.mindee.v1.product.invoicesplitter.InvoiceSplitterV1InvoicePageGroup; import java.io.IOException; import java.util.ArrayList; @@ -12,7 +12,7 @@ /** * PDF extraction class. */ -public class PDFExtractor extends PDFExtractorBase { +public class PDFExtractor extends BasePDFExtractor { /** * Init from a {@link LocalInputSource}. diff --git a/src/test/java/com/mindee/input/FileCompressionTest.java b/src/test/java/com/mindee/input/FileCompressionTest.java index 5235edef9..a8ef329d1 100644 --- a/src/test/java/com/mindee/input/FileCompressionTest.java +++ b/src/test/java/com/mindee/input/FileCompressionTest.java @@ -254,10 +254,10 @@ public void testPdfResizeFromCompressor() throws IOException { var compressor = new PDFCompressor(); List resizes = Arrays .asList( - compressor.compressPdf(pdfResizeInput.getFile()), - compressor.compressPdf(pdfResizeInput.getFile(), 75), - compressor.compressPdf(pdfResizeInput.getFile(), 50), - compressor.compressPdf(pdfResizeInput.getFile(), 10) + compressor.compressPDF(pdfResizeInput.getFile()), + compressor.compressPDF(pdfResizeInput.getFile(), 75), + compressor.compressPDF(pdfResizeInput.getFile(), 50), + compressor.compressPDF(pdfResizeInput.getFile(), 10) ); List outputPaths = Arrays diff --git a/src/test/java/com/mindee/input/LocalInputSourceTest.java b/src/test/java/com/mindee/input/LocalInputSourceTest.java index 6482bf51e..81788e354 100644 --- a/src/test/java/com/mindee/input/LocalInputSourceTest.java +++ b/src/test/java/com/mindee/input/LocalInputSourceTest.java @@ -16,8 +16,7 @@ public class LocalInputSourceTest { void assertMultipagePDF(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - Assertions.assertTrue(inputSource.isPdf()); - Assertions.assertTrue(inputSource.hasSourceText()); + Assertions.assertTrue(inputSource.isPDF()); Assertions.assertEquals(3, inputSource.getPageCount()); Assertions.assertEquals("multipage_cut-3.pdf", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); @@ -64,15 +63,13 @@ void loadPDF__withoutText_mustNotDetectSourceText() throws MindeeException, IOEx String encodedFile = Base64.encodeBase64String(Files.readAllBytes(filePath)); var localInputSource = new LocalInputSource(encodedFile, "default_sample.pdf"); Assertions.assertNotNull(localInputSource); - Assertions.assertTrue(localInputSource.isPdf()); - Assertions.assertFalse(localInputSource.hasSourceText()); + Assertions.assertTrue(localInputSource.isPDF()); } void assertImage(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - Assertions.assertFalse(inputSource.isPdf()); - Assertions.assertFalse(inputSource.hasSourceText()); + Assertions.assertFalse(inputSource.isPDF()); Assertions.assertEquals(1, inputSource.getPageCount()); Assertions.assertEquals("receipt.jpg", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); diff --git a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java index 8cf07c4d2..b831d92cd 100644 --- a/src/test/java/com/mindee/pdf/PDFOperationTest.java +++ b/src/test/java/com/mindee/pdf/PDFOperationTest.java @@ -19,40 +19,7 @@ public class PDFOperationTest { - private final PDFInputSource pdfOperation = new PDFInputSourcer(); - -// @Test -// public void shouldConvertSinglePageToJpg() throws IOException { -// LocalInputSource source = new LocalInputSource( -// "src/test/resources/file_types/pdf/multipage.pdf" -// ); -// PdfPageImage pdfPageImage = pdfOperation.pdfPageToImage(source, 3); -// Assertions.assertNotNull(pdfPageImage.getImage()); -// Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); -// pdfPageImage.writeToFile("src/test/resources/output/"); -// Assertions -// .assertTrue( -// Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) -// ); -// } - -// @Test -// public void shouldConvertAllPagesToJpg() throws IOException { -// LocalInputSource source = new LocalInputSource( -// "src/test/resources/file_types/pdf/multipage.pdf" -// ); -// List pdfPageImages = pdfOperation.pdfToImages(source); -// for (PdfPageImage pdfPageImage : pdfPageImages) { -// Assertions.assertNotNull(pdfPageImage.getImage()); -// Assertions -// .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); -// pdfPageImage.writeToFile("src/test/resources/output/"); -// Assertions -// .assertTrue( -// Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) -// ); -// } -// } + private final PDFInputOperation pdfOperation = new PDFInputOperator(); @Test public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws IOException { @@ -66,7 +33,7 @@ public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws document.close(); File file = getResourcePath("output/test.pdf").toFile(); LocalInputSource source = new LocalInputSource(file); - Assertions.assertEquals(random, pdfOperation.getNumberOfPages(source)); + Assertions.assertEquals(random, source.getPageCount()); file.delete(); } @@ -123,7 +90,7 @@ public void givenADocumentAndListOfPagesToRemove_whenSplit_thenReturnsOnlyNotRem } @Test - public void givenADocumentOtherThantAPdf_whenSplit_mustFail() throws IOException { + public void givenADocumentOtherThantAPdf_whenSplit_mustFail() { PageOptions pageOptions = new PageOptions.Builder() .pageIndexes(new Integer[] { 1, 2, 3 }) diff --git a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java b/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java index 794cf7874..dd33c1663 100644 --- a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java +++ b/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java @@ -68,7 +68,6 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc InvoiceSplitterV1 inference = document.getInference(); PDFExtractor extractor = new PDFExtractor(invoiceSplitterInputSource); - Assertions.assertEquals(2, extractor.getPageCount()); List extractedPDFsStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(2, extractedPDFsStrict.size()); diff --git a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java index e30aada8e..e68a3d52e 100644 --- a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java +++ b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java @@ -28,7 +28,6 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException { InvoiceSplitterV1 inference = response.getDocument().getInference(); PDFExtractor extractor = new PDFExtractor(pdf); - Assertions.assertEquals(5, extractor.getPageCount()); var extractedPDFSNoStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(3, extractedPDFSNoStrict.size()); @@ -39,14 +38,13 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException { @Test public void givenAPDF_shouldExtractInvoicesStrict() throws IOException { - LocalInputSource pdf = new LocalInputSource( + var inputSource = new LocalInputSource( getV1ResourcePath("products/invoice_splitter/invoice_5p.pdf") ); PredictResponse response = getInvoiceSplitterPrediction(); InvoiceSplitterV1 inference = response.getDocument().getInference(); - PDFExtractor extractor = new PDFExtractor(pdf); - Assertions.assertEquals(5, extractor.getPageCount()); + PDFExtractor extractor = new PDFExtractor(inputSource); var extractedPDFStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), true); Assertions.assertEquals(2, extractedPDFStrict.size()); From baf6e62421a4ed0e7b54b411a638d7930365e505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ianar=C3=A9=20s=C3=A9vi?= Date: Wed, 29 Apr 2026 11:21:43 +0200 Subject: [PATCH 4/4] fixes following comments --- .../java/com/mindee/image/ImageExtractor.java | 9 ++-- .../PDFPageImage.java} | 6 +-- .../com/mindee/input/LocalInputSource.java | 47 ++++++++----------- .../com/mindee/input/FileCompressionTest.java | 4 +- 4 files changed, 28 insertions(+), 38 deletions(-) rename src/main/java/com/mindee/{pdf/PdfPageImage.java => image/PDFPageImage.java} (96%) diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index caf728781..e133eda7e 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -4,7 +4,6 @@ import com.mindee.geometry.PositionDataField; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; -import com.mindee.pdf.PdfPageImage; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -32,7 +31,7 @@ public ImageExtractor(LocalInputSource source) throws IOException { if (source.isPDF()) { this.saveFormat = "jpg"; var pdfPageImages = pdfToImages(source.getFile(), this.filename); - for (PdfPageImage pdfPageImage : pdfPageImages) { + for (PDFPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } } else { @@ -44,13 +43,13 @@ public ImageExtractor(LocalInputSource source) throws IOException { } } - public List pdfToImages(byte[] fileBytes, String filename) throws IOException { + public List pdfToImages(byte[] fileBytes, String filename) throws IOException { PDDocument document = Loader.loadPDF(fileBytes); var pdfRenderer = new PDFRenderer(document); - List pdfPageImages = new ArrayList<>(); + List pdfPageImages = new ArrayList<>(); for (int i = 0; i < document.getNumberOfPages(); i++) { var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); - pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); + pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg")); } document.close(); return pdfPageImages; diff --git a/src/main/java/com/mindee/pdf/PdfPageImage.java b/src/main/java/com/mindee/image/PDFPageImage.java similarity index 96% rename from src/main/java/com/mindee/pdf/PdfPageImage.java rename to src/main/java/com/mindee/image/PDFPageImage.java index 9ef2e54d7..fe9493b36 100644 --- a/src/main/java/com/mindee/pdf/PdfPageImage.java +++ b/src/main/java/com/mindee/image/PDFPageImage.java @@ -1,4 +1,4 @@ -package com.mindee.pdf; +package com.mindee.image; import com.mindee.MindeeException; import com.mindee.input.InputSourceUtils; @@ -16,13 +16,13 @@ * A page in a PDF extracted as an image. */ @Getter -public class PdfPageImage { +public class PDFPageImage { private final BufferedImage image; private final int originalIndex; private final String saveFormat; private final String originalFilename; - public PdfPageImage( + public PDFPageImage( BufferedImage image, int originalIndex, String originalFilename, diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index a3b536f1a..073cb046f 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -62,14 +62,14 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } - private PDFInputOperation getPdfInputOperator() { + private PDFInputOperation getPDFInputOperator() { if (this.pdfInputOperator == null) { this.pdfInputOperator = new PDFInputOperator(); } return this.pdfInputOperator; } - private PDFCompression getPdfCompressor() { + private PDFCompression getPDFCompressor() { if (this.pdfCompressor == null) { this.pdfCompressor = new PDFCompressor(); } @@ -86,7 +86,7 @@ public int getPageCount() throws IOException { if (!this.isPDF()) { return 1; } - return getPdfInputOperator().getPageCount(this.file); + return getPDFInputOperator().getPageCount(this.file); } /** @@ -97,7 +97,7 @@ public int getPageCount() throws IOException { */ public void applyPageOptions(PageOptions pageOptions) throws IOException { if (pageOptions != null && this.isPDF()) { - this.file = getPdfInputOperator().split(this.file, pageOptions).getFile(); + this.file = getPDFInputOperator().split(this.file, pageOptions).getFile(); } } @@ -106,61 +106,52 @@ public void applyPageOptions(PageOptions pageOptions) throws IOException { */ public boolean isPDF() { if (this.isPDF == null) { - this.isPDF = getPdfInputOperator().isPDF(this.file); + this.isPDF = getPDFInputOperator().isPDF(this.file); } return this.isPDF; } - public LocalInputSource compress( - Integer quality, + public void compress( + int quality, Integer maxWidth, Integer maxHeight, Boolean forceSourceText, Boolean disableSourceText ) throws IOException { if (isPDF()) { - this.file = getPdfCompressor() + this.file = getPDFCompressor() .compressPDF(this.file, quality, forceSourceText, disableSourceText); } else { this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight); } - return this; } - public LocalInputSource compress( - Integer quality, + public void compress( + int quality, Integer maxWidth, Integer maxHeight, Boolean forceSourceText ) throws IOException { - return this.compress(quality, maxWidth, maxHeight, forceSourceText, true); + this.compress(quality, maxWidth, maxHeight, forceSourceText, true); } - public LocalInputSource compress( + public void compress( int quality, boolean forceSourceText, boolean disableSourceText ) throws IOException { - return this.compress(quality, null, null, forceSourceText, disableSourceText); - } - - public LocalInputSource compress( - Integer quality, - Integer maxWidth, - Integer maxHeight - ) throws IOException { - return this.compress(quality, maxWidth, maxHeight, false, true); + this.compress(quality, null, null, forceSourceText, disableSourceText); } - public LocalInputSource compress(Integer quality, Integer maxWidth) throws IOException { - return this.compress(quality, maxWidth, null, false, true); + public void compress(int quality, Integer maxWidth, Integer maxHeight) throws IOException { + this.compress(quality, maxWidth, maxHeight, false, true); } - public LocalInputSource compress(Integer quality) throws IOException { - return this.compress(quality, null, null, false, true); + public void compress(int quality) throws IOException { + this.compress(quality, null, null, false, true); } - public LocalInputSource compress() throws IOException { - return this.compress(85, null, null, false, true); + public void compress() throws IOException { + this.compress(85, null, null, false, true); } } diff --git a/src/test/java/com/mindee/input/FileCompressionTest.java b/src/test/java/com/mindee/input/FileCompressionTest.java index a8ef329d1..0047c8cfa 100644 --- a/src/test/java/com/mindee/input/FileCompressionTest.java +++ b/src/test/java/com/mindee/input/FileCompressionTest.java @@ -326,8 +326,8 @@ public void testPdfResizeWithTextKeepsText() throws IOException { var originalDoc = Loader.loadPDF(initialWithText.getFile()); - var compressedWithText = initialWithText.compress(100, true, false).getFile(); - var compressedDoc = Loader.loadPDF(compressedWithText); + initialWithText.compress(100, true, false); + var compressedDoc = Loader.loadPDF(initialWithText.getFile()); Assertions.assertEquals(originalDoc.getNumberOfPages(), compressedDoc.getNumberOfPages()); Assertions.assertNotEquals(originalDoc.hashCode(), compressedDoc.hashCode());