Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 9 additions & 34 deletions src/main/java/com/mindee/image/ImageExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
* Extract sub-images from an image.
Expand All @@ -30,7 +25,7 @@ public ImageExtractor(LocalInputSource source) throws IOException {

if (source.isPDF()) {
this.saveFormat = "jpg";
var pdfPageImages = pdfToImages(source.getFile(), source.getFilename());
var pdfPageImages = getPDFRasterizer().PDFToImages(source.getFile(), source.getFilename());
for (PDFPageImage pdfPageImage : pdfPageImages) {
this.pageImages.add(pdfPageImage.getImage());
}
Expand All @@ -45,34 +40,14 @@ public ImageExtractor(LocalInputSource source) throws IOException {
}
}

/**
 * Rasterize every page of a PDF into an image.
 *
 * @param fileBytes raw bytes of the PDF file, handed to {@code Loader.loadPDF}.
 * @param filename name passed through unchanged to each {@code PDFPageImage}.
 * @return one {@code PDFPageImage} per page, in page order.
 * @throws IOException if the PDF cannot be loaded or a page cannot be rendered.
 */
private List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
  // try-with-resources guarantees the document is closed even when rendering a page throws;
  // previously close() was only reached on the success path.
  try (PDDocument document = Loader.loadPDF(fileBytes)) {
    var pdfRenderer = new PDFRenderer(document);
    int pageCount = document.getNumberOfPages();
    List<PDFPageImage> pdfPageImages = new ArrayList<>(pageCount);
    for (int i = 0; i < pageCount; i++) {
      var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
      pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
    }
    return pdfPageImages;
  }
}

private BufferedImage pdfPageToImageBuffer(
int index,
PDDocument document,
PDFRenderer pdfRenderer
) throws IOException {
PDRectangle bbox = document.getPage(index).getBBox();
float dimension = bbox.getWidth() * bbox.getHeight();
int dpi;
if (dimension < 200000) {
dpi = 300;
} else if (dimension < 300000) {
dpi = 250;
} else {
dpi = 200;
}
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
/**
* Get the PDF rasterization implementation.
* Override this method to provide custom PDF rasterization handling.
*
* <p>The default implementation returns a new {@link PDFRasterizer} on every call.
*
* <p>NOTE(review): this method appears to be invoked from the {@code ImageExtractor}
* constructor when the source is a PDF, i.e. before a subclass constructor body has run.
* An override should therefore not rely on subclass fields - confirm against the full class.
*
* @return The PDF rasterization implementation.
*/
protected PDFRasterization getPDFRasterizer() {
return new PDFRasterizer();
}

/**
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/com/mindee/image/PDFRasterization.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.mindee.image;

import java.io.IOException;
import java.util.List;

/**
* Rasterize a PDF into images.
*
* <p>Abstraction over the conversion of a PDF file's bytes into per-page images, so that the
* rendering backend can be swapped by providing another implementation.
*/
public interface PDFRasterization {
/**
* Rasterize a PDF into a list of images, one image per page.
*
* @param fileBytes Raw bytes of the PDF file.
* @param filename Name associated with the PDF; how it is used is up to the implementation.
* @return The rasterized pages, one entry per page.
* @throws IOException May be thrown by implementations when the PDF cannot be processed.
*/
List<PDFPageImage> PDFToImages(byte[] fileBytes, String filename) throws IOException;
}
46 changes: 46 additions & 0 deletions src/main/java/com/mindee/image/PDFRasterizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.mindee.image;

import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
* Default PDF rasterization implementation.
*/
public class PDFRasterizer implements PDFRasterization {
public List<PDFPageImage> PDFToImages(byte[] fileBytes, String filename) throws IOException {
PDDocument document = Loader.loadPDF(fileBytes);
var pdfRenderer = new PDFRenderer(document);
List<PDFPageImage> pdfPageImages = new ArrayList<>();
for (int i = 0; i < document.getNumberOfPages(); i++) {
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
}
document.close();
return pdfPageImages;
}

private BufferedImage pdfPageToImageBuffer(
int index,
PDDocument document,
PDFRenderer pdfRenderer
) throws IOException {
PDRectangle bbox = document.getPage(index).getBBox();
float dimension = bbox.getWidth() * bbox.getHeight();
int dpi;
if (dimension < 200000) {
dpi = 300;
} else if (dimension < 300000) {
dpi = 250;
} else {
dpi = 200;
}
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
}
}
2 changes: 1 addition & 1 deletion src/main/java/com/mindee/input/LocalInputSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public LocalInputSource(String fileAsBase64, String filename) {

/**
* Get the PDFInputOperation instance.
* Override this method to provide custom PDF input operation handling.
* Override this method to provide custom PDF input handling.
*
* @return PDFInputOperation instance
*/
Expand Down
26 changes: 13 additions & 13 deletions src/main/java/com/mindee/pdf/BasePDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,19 @@ public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws
return extractedPDFs;
}

/**
 * Build the filename for a split from the source filename and a page range.
 *
 * <p>The result is the first part of the split source name, an underscore, the first page
 * number, a dash, the last page number, a dot, and the second part of the split source name.
 * Both page numbers are left-padded with zeros to a width of three characters.
 *
 * @param pageNumbers page numbers of the split; only the first and last entries are used.
 * @return the generated filename.
 */
protected String makeFilename(List<Integer> pageNumbers) {
  final String[] nameParts = InputSourceUtils.splitNameStrict(filename);
  final String prefix = nameParts[0];

  final Integer firstPage = pageNumbers.get(0);
  final String paddedFirst = String.format("_%3s", firstPage).replace(" ", "0");

  final Integer lastPage = pageNumbers.get(pageNumbers.size() - 1);
  final String paddedLast = String.format("%3s", lastPage).replace(" ", "0");

  return prefix + paddedFirst + "-" + paddedLast + "." + nameParts[1];
}

/**
* Converts an array to a buffered image.
*
Expand All @@ -95,19 +108,6 @@ private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws I
}
}

/**
 * Build the filename for a split from the source filename and a page range.
 *
 * <p>The result is the first part of the split source name, an underscore, the first page
 * number, a dash, the last page number, a dot, and the second part of the split source name.
 * Both page numbers are left-padded with zeros to a width of three characters.
 *
 * @param pageNumbers page numbers of the split; only the first and last entries are used.
 * @return the generated filename.
 */
private String makeFilename(List<Integer> pageNumbers) {
  final String[] nameParts = InputSourceUtils.splitNameStrict(filename);
  final String prefix = nameParts[0];

  final Integer firstPage = pageNumbers.get(0);
  final String paddedFirst = String.format("_%3s", firstPage).replace(" ", "0");

  final Integer lastPage = pageNumbers.get(pageNumbers.size() - 1);
  final String paddedLast = String.format("%3s", lastPage).replace(" ", "0");

  return prefix + paddedFirst + "-" + paddedLast + "." + nameParts[1];
}

private static PDPage clonePage(PDPage page) {

COSDictionary pageDict = page.getCOSObject();
Expand Down
9 changes: 6 additions & 3 deletions src/main/java/com/mindee/pdf/PDFCompression.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,26 @@

import java.io.IOException;

/**
* Compress a PDF.
*/
public interface PDFCompression {
byte[] compressPDF(
byte[] fileBytes,
Integer imageQuality,
int imageQuality,
Boolean forceSourceTextCompression,
Boolean disableSourceText
) throws IOException;

default byte[] compressPDF(
byte[] fileBytes,
Integer imageQuality,
int imageQuality,
Boolean forceSourceTextCompression
) throws IOException {
return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true);
}

default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException {
default byte[] compressPDF(byte[] fileBytes, int imageQuality) throws IOException {
return compressPDF(fileBytes, imageQuality, false, true);
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/mindee/pdf/PDFCompressor.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public PDFCompressor() {
@Override
public byte[] compressPDF(
byte[] fileBytes,
Integer imageQuality,
int imageQuality,
Boolean forceSourceTextCompression,
Boolean disableSourceText
) throws IOException {
Comment thread
sebastianMindee marked this conversation as resolved.
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/com/mindee/pdf/PDFInputOperation.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import com.mindee.input.PageOptions;
import java.io.IOException;

/**
* Various operations required for PDF input files.
*/
public interface PDFInputOperation {

/**
Expand Down
Loading