Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/main/java/com/mindee/image/ExtractedImage.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ public class ExtractedImage {
private final BufferedImage image;
private final String filename;
private final String saveFormat;
private final int pageId;

/**
* Default constructor.
Expand All @@ -25,10 +26,11 @@ public class ExtractedImage {
* @param filename Name of the extracted image.
* @param saveFormat Format to save the image as, defaults to PNG.
* @param pageId Index of the page the image was extracted from.
*/
public ExtractedImage(BufferedImage image, String filename, String saveFormat) {
public ExtractedImage(BufferedImage image, String filename, String saveFormat, int pageId) {
this.image = image;
this.filename = filename;
this.saveFormat = saveFormat;
this.pageId = pageId;
}

/**
Expand Down
17 changes: 17 additions & 0 deletions src/main/java/com/mindee/image/ExtractedImages.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package com.mindee.image;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;

public class ExtractedImages extends ArrayList<ExtractedImage> {
public void saveAllToDisk(String outputPath) throws IOException {
saveAllToDisk(Path.of(outputPath));
}

public void saveAllToDisk(Path outputPath) throws IOException {
for (ExtractedImage image : this) {
image.writeToFile(outputPath);
}
}
}
19 changes: 11 additions & 8 deletions src/main/java/com/mindee/image/ImageExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,18 @@ public class ImageExtractor {
private final String saveFormat;

public ImageExtractor(LocalInputSource source) throws IOException {
this.filename = source.getFilename();

this.pageImages = new ArrayList<>();

if (source.isPDF()) {
this.saveFormat = "jpg";
var pdfPageImages = pdfToImages(source.getFile(), this.filename);
var pdfPageImages = pdfToImages(source.getFile(), source.getFilename());
for (PDFPageImage pdfPageImage : pdfPageImages) {
this.pageImages.add(pdfPageImage.getImage());
}
this.filename = source.getFilename() + "." + this.saveFormat;
} else {
this.filename = source.getFilename();
String[] splitName = InputSourceUtils.splitNameStrict(this.filename);
this.saveFormat = splitName[1].toLowerCase();

Expand All @@ -43,7 +45,7 @@ public ImageExtractor(LocalInputSource source) throws IOException {
}
}

public List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
private List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
PDDocument document = Loader.loadPDF(fileBytes);
var pdfRenderer = new PDFRenderer(document);
List<PDFPageImage> pdfPageImages = new ArrayList<>();
Expand Down Expand Up @@ -90,7 +92,7 @@ public int getPageCount() {
* @param pageIndex The page index to extract, begins at 0.
* @return A list of {@link ExtractedImage}.
*/
public <FieldT extends PositionDataField> List<ExtractedImage> extractImagesFromPage(
public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
List<FieldT> fields,
int pageIndex
) {
Expand All @@ -106,7 +108,7 @@ public <FieldT extends PositionDataField> List<ExtractedImage> extractImagesFrom
* @param outputName The base output filename, must have an image extension.
* @return A list of {@link ExtractedImage}.
*/
public <FieldT extends PositionDataField> List<ExtractedImage> extractImagesFromPage(
public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
List<FieldT> fields,
int pageIndex,
String outputName
Expand All @@ -121,7 +123,7 @@ public <FieldT extends PositionDataField> List<ExtractedImage> extractImagesFrom
return extractFromPage(fields, pageIndex, filename);
}

private <FieldT extends PositionDataField> List<ExtractedImage> extractFromPage(
private <FieldT extends PositionDataField> ExtractedImages extractFromPage(
List<FieldT> fields,
int pageIndex,
String outputName
Expand All @@ -131,7 +133,7 @@ private <FieldT extends PositionDataField> List<ExtractedImage> extractFromPage(
.format("%s_page-%3s.%s", splitName[0], pageIndex + 1, splitName[1])
.replace(" ", "0");

var extractedImages = new ArrayList<ExtractedImage>();
var extractedImages = new ExtractedImages();
for (int i = 0; i < fields.size(); i++) {
ExtractedImage extractedImage = extractImage(fields.get(i), pageIndex, i + 1, filename);
if (extractedImage != null) {
Expand Down Expand Up @@ -171,7 +173,8 @@ public <FieldT extends PositionDataField> ExtractedImage extractImage(
return new ExtractedImage(
extractImage(polygon.getAsBbox(), pageIndex),
fieldFilename,
saveFormat
saveFormat,
pageIndex
);
}

Expand Down
76 changes: 37 additions & 39 deletions src/main/java/com/mindee/pdf/BasePDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
Expand Down Expand Up @@ -56,17 +55,15 @@ public BasePDFExtractor(LocalInputSource source) throws IOException {
}
}

/**
* Converts an array to a buffered image.
*
* @param byteArray Raw byte array.
* @return a valid ImageIO buffer.
* @throws IOException Throws if the file can't be accessed.
*/
public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
try (ByteArrayInputStream stream = new ByteArrayInputStream(byteArray)) {
return ImageIO.read(stream);
/**
 * Builds one PDF out of the given pages of the source document.
 *
 * @param pageNumbers Pages to copy into the new PDF, must not be empty.
 * @param closeOriginal Forwarded as-is to {@code createPdfFromExistingPdf}.
 *     NOTE(review): presumably closes the source document once the copy is done - confirm
 *     before reusing this extractor after a call with {@code true}.
 * @return The extracted PDF, named by {@code makeFilename}.
 * @throws IOException Propagated from the PDF creation.
 * @throws MindeeException If {@code pageNumbers} is empty.
 */
public ExtractedPDF extractSinglePage(
    List<Integer> pageNumbers,
    boolean closeOriginal
) throws IOException {
  if (!pageNumbers.isEmpty()) {
    return new ExtractedPDF(
      createPdfFromExistingPdf(this.sourcePdf, pageNumbers, closeOriginal),
      makeFilename(pageNumbers)
    );
  }
  throw new MindeeException("Empty indexes not allowed for extraction.");
}

/**
Expand All @@ -76,32 +73,41 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
* @return A list of extracted files.
* @throws IOException Throws if the file can't be accessed.
*/
public List<ExtractedPDF> extractSubDocuments(
List<List<Integer>> pageIndexes
) throws IOException {
var extractedPDFs = new ArrayList<ExtractedPDF>();
public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws IOException {
var extractedPDFs = new ExtractedPDFs();

for (List<Integer> pageIndexElement : pageIndexes) {
if (pageIndexElement.isEmpty()) {
throw new MindeeException("Empty indexes not allowed for extraction.");
}
String[] splitName = InputSourceUtils.splitNameStrict(filename);
String fieldFilename = splitName[0]
+ String.format("_%3s", pageIndexElement.get(0) + 1).replace(" ", "0")
+ "-"
+ String
.format("%3s", pageIndexElement.get(pageIndexElement.size() - 1) + 1)
.replace(" ", "0")
+ "."
+ splitName[1];
extractedPDFs
.add(
new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename)
);
extractedPDFs.add(extractSinglePage(pageIndexElement, false));
}
return extractedPDFs;
}

/**
 * Decodes raw image bytes into a {@link BufferedImage}.
 *
 * @param byteArray Encoded image data.
 * @return The decoded image, or {@code null} when {@link ImageIO#read} finds no registered reader
 *     able to decode the data.
 * @throws IOException Throws if an error occurs while reading the data.
 */
private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IOException {
  try (var imageBytes = new ByteArrayInputStream(byteArray)) {
    BufferedImage decoded = ImageIO.read(imageBytes);
    return decoded;
  }
}

/**
 * Builds the filename of a split: the source name's stem, then the first and last entries of
 * {@code pageNumbers} left-padded with zeros to three characters and joined by a dash, then the
 * source name's extension.
 *
 * @param pageNumbers Pages of the split, must not be empty. Values are used as given, without any
 *     offset.
 * @return The filename, e.g. {@code name_000-002.pdf} for pages 0 to 2 of {@code name.pdf}.
 */
private String makeFilename(List<Integer> pageNumbers) {
  String[] nameParts = InputSourceUtils.splitNameStrict(filename);
  String stem = nameParts[0];
  // "%3s" pads with spaces; swapping them for zeros yields the 3-character page tag.
  String firstTag = String.format("_%3s", pageNumbers.get(0)).replace(" ", "0");
  int lastPosition = pageNumbers.size() - 1;
  String lastTag = String.format("%3s", pageNumbers.get(lastPosition)).replace(" ", "0");
  String extension = nameParts[1];
  return stem + firstTag + "-" + lastTag + "." + extension;
}

private static PDPage clonePage(PDPage page) {

COSDictionary pageDict = page.getCOSObject();
Expand Down Expand Up @@ -135,12 +141,4 @@ private static byte[] createPdfFromExistingPdf(
outputStream.close();
return output;
}

public byte[] mergePdfPages(
PDDocument document,
List<Integer> pageNumbers,
boolean closeOriginal
) throws IOException {
return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
}
}
6 changes: 6 additions & 0 deletions src/main/java/com/mindee/pdf/ExtractedPDFs.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package com.mindee.pdf;

import java.util.ArrayList;

/**
 * A list of {@link ExtractedPDF}; adds no members of its own on top of {@link ArrayList}.
 */
public class ExtractedPDFs extends ArrayList<ExtractedPDF> {
}
35 changes: 35 additions & 0 deletions src/main/java/com/mindee/v2/fileOperations/Crop.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.mindee.v2.fileOperations;

import com.mindee.image.ExtractedImage;
import com.mindee.image.ExtractedImages;
import com.mindee.image.ImageExtractor;
import com.mindee.input.LocalInputSource;
import com.mindee.v2.product.crop.CropItem;
import java.io.IOException;
import java.util.List;

/**
 * Extracts the regions described by {@link CropItem}s from a local input source, using an
 * {@link ImageExtractor} built once per instance.
 */
public class Crop {
  private final ImageExtractor imageExtractor;

  /**
   * @param inputSource Local file the crops are taken from.
   * @throws IOException Propagated from the {@link ImageExtractor} constructor.
   */
  public Crop(LocalInputSource inputSource) throws IOException {
    this.imageExtractor = new ImageExtractor(inputSource);
  }

  /**
   * Extracts the image for a single crop item, using index 0.
   *
   * @param cropItem Item whose location is extracted.
   * @return The result of {@link ImageExtractor#extractImage} for that location.
   * @throws IOException Declared for callers; kept as part of the existing signature.
   */
  public ExtractedImage extractSingle(CropItem cropItem) throws IOException {
    return cropAt(cropItem, 0);
  }

  /**
   * Extracts the image of every crop item, in order, numbering them from 1.
   *
   * <p>NOTE(review): {@code ImageExtractor.extractFromPage} null-checks the result of
   * {@code extractImage} before adding it; results are added unconditionally here - confirm
   * whether a crop location can yield {@code null}.
   *
   * @param cropItems Items whose locations are extracted.
   * @return One entry per crop item.
   */
  public ExtractedImages extractMultiple(List<CropItem> cropItems) {
    var results = new ExtractedImages();
    int position = 0;
    for (CropItem item : cropItems) {
      position++;
      results.add(cropAt(item, position));
    }
    return results;
  }

  /** Extracts the item's location from the page that location reports. */
  private ExtractedImage cropAt(CropItem cropItem, int index) {
    var location = cropItem.getLocation();
    return this.imageExtractor.extractImage(location, location.getPage(), index);
  }
}
29 changes: 29 additions & 0 deletions src/main/java/com/mindee/v2/fileOperations/Split.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.mindee.v2.fileOperations;

import com.mindee.input.LocalInputSource;
import com.mindee.pdf.BasePDFExtractor;
import com.mindee.pdf.ExtractedPDF;
import com.mindee.pdf.ExtractedPDFs;
import com.mindee.v2.product.split.SplitRange;
import java.io.IOException;
import java.util.ArrayList;
import java.util.stream.Collectors;

/**
 * Splits a local PDF into sub-documents described by {@link SplitRange}s, delegating to a
 * {@link BasePDFExtractor} built once per instance.
 */
public class Split {
  BasePDFExtractor pdfSplitter;

  /**
   * @param inputSource Local file to split.
   * @throws IOException Propagated from the {@link BasePDFExtractor} constructor.
   */
  public Split(LocalInputSource inputSource) throws IOException {
    this.pdfSplitter = new BasePDFExtractor(inputSource);
  }

  /**
   * Extracts the pages of one split range as a single PDF.
   *
   * <p>NOTE(review): this passes {@code closeOriginal = true}, whereas the multi-range path
   * passes {@code false} - confirm whether this instance is still usable after this call.
   *
   * @param splitRange Range whose distinct page values are extracted.
   * @return The extracted PDF.
   * @throws IOException Propagated from the extractor.
   */
  public ExtractedPDF extractSingle(SplitRange splitRange) throws IOException {
    var pages = splitRange.getPageRangeDistinct();
    return this.pdfSplitter.extractSinglePage(pages, true);
  }

  /**
   * Extracts one PDF per split range, in the order given.
   *
   * @param splitRanges Ranges to extract; each contributes its distinct page values.
   * @return The extracted PDFs.
   * @throws IOException Propagated from the extractor.
   */
  public ExtractedPDFs extractMultiple(ArrayList<SplitRange> splitRanges) throws IOException {
    var pageGroups = splitRanges
      .stream()
      .map(range -> range.getPageRangeDistinct())
      .collect(Collectors.toList());
    return this.pdfSplitter.extractSubDocuments(pageGroups);
  }
}
11 changes: 10 additions & 1 deletion src/main/java/com/mindee/v2/product/split/SplitRange.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.Getter;
Expand All @@ -22,11 +24,18 @@ public class SplitRange {
* indicates the end page.
*/
@JsonProperty("page_range")
public ArrayList<Integer> pageRange;
public List<Integer> pageRange;

/**
* The document type, as identified on given classification values.
*/
@JsonProperty("document_type")
public String documentType;

/**
 * Returns a new list holding the values of {@link #pageRange} with duplicates removed, in
 * first-occurrence order. Useful for extracting pages from a document.
 *
 * <p>NOTE(review): if {@code pageRange} is a [start, end] pair, the result holds only the listed
 * values, not the pages in between - confirm that consumers expand the range where needed.
 *
 * @return A fresh, modifiable list; changes to it do not affect {@link #pageRange}.
 */
public List<Integer> getPageRangeDistinct() {
  LinkedHashSet<Integer> uniquePages = new LinkedHashSet<>(this.pageRange);
  return new ArrayList<>(uniquePages);
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.mindee.v1.fileOperation;
package com.mindee.v1.fileOperations;

import static com.mindee.TestingUtilities.getV1ResourcePath;
import static com.mindee.TestingUtilities.levenshteinRatio;
Expand Down Expand Up @@ -71,8 +71,8 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc
List<ExtractedPDF> extractedPDFsStrict = extractor
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
Assertions.assertEquals(2, extractedPDFsStrict.size());
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename());
Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename());
Assertions.assertEquals("default_sample_000-000.pdf", extractedPDFsStrict.get(0).getFilename());
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(1).getFilename());

PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(
extractedPDFsStrict.get(0).asInputSource()
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException {
var extractedPDFSNoStrict = extractor
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
Assertions.assertEquals(3, extractedPDFSNoStrict.size());
Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFSNoStrict.get(0).getFilename());
Assertions.assertEquals("invoice_5p_002-004.pdf", extractedPDFSNoStrict.get(1).getFilename());
Assertions.assertEquals("invoice_5p_005-005.pdf", extractedPDFSNoStrict.get(2).getFilename());
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFSNoStrict.get(0).getFilename());
Assertions.assertEquals("invoice_5p_001-003.pdf", extractedPDFSNoStrict.get(1).getFilename());
Assertions.assertEquals("invoice_5p_004-004.pdf", extractedPDFSNoStrict.get(2).getFilename());
}

@Test
Expand All @@ -48,7 +48,7 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException {
var extractedPDFStrict = extractor
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), true);
Assertions.assertEquals(2, extractedPDFStrict.size());
Assertions.assertEquals("invoice_5p_001-001.pdf", extractedPDFStrict.get(0).getFilename());
Assertions.assertEquals("invoice_5p_002-005.pdf", extractedPDFStrict.get(1).getFilename());
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFStrict.get(0).getFilename());
Assertions.assertEquals("invoice_5p_001-004.pdf", extractedPDFStrict.get(1).getFilename());
}
}
Loading
Loading