Skip to content

Commit 1e5a858

Browse files
✨ add support for crop & split operations (#457)
1 parent ff4866f commit 1e5a858

23 files changed

Lines changed: 929 additions & 105 deletions

src/image/extractedImage.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@ import { loadOptionalDependency } from "@/dependency/index.js";
1515
*/
1616
export class ExtractedImage {
1717
public buffer: Buffer;
18-
protected internalFileName: string;
18+
public filename: string;
19+
public pageId?: number;
20+
public elementId?: number;
1921

20-
protected constructor(buffer: Uint8Array, fileName: string) {
22+
constructor(buffer: Uint8Array, fileName: string, pageId?: number, elementId?: number) {
2123
this.buffer = Buffer.from(buffer);
22-
this.internalFileName = fileName;
24+
this.filename = fileName;
25+
this.pageId = pageId;
26+
this.elementId = elementId;
2327
}
2428

2529
/**
@@ -104,7 +108,7 @@ export class ExtractedImage {
104108
asSource(): BufferInput {
105109
return new BufferInput({
106110
buffer: this.buffer,
107-
filename: this.internalFileName,
111+
filename: this.filename,
108112
});
109113
}
110114
}

src/image/imageExtractor.ts

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,107 @@
1+
import { loadOptionalDependency } from "@/dependency/index.js";
2+
import { MindeeImageError } from "@/errors/index.js";
3+
import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js";
4+
import { adjustForRotation } from "@/geometry/polygonUtils.js";
5+
import { ExtractedImage } from "@/image/extractedImage.js";
6+
import { LocalInputSource } from "@/input/index.js";
7+
import { logger } from "@/logger.js";
8+
import { createPdfFromInputSource } from "@/pdf/pdfOperation.js";
9+
import { rasterizePage } from "@/pdf/pdfUtils.js";
110
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
211
// @ts-ignore
312
import type * as pdfLibTypes from "@cantoo/pdf-lib";
4-
import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js";
5-
import { adjustForRotation } from "@/geometry/polygonUtils.js";
6-
import { loadOptionalDependency } from "@/dependency/index.js";
713

814
let pdfLib: typeof pdfLibTypes | null = null;
915

1016
async function getPdfLib(): Promise<typeof pdfLibTypes> {
1117
if (!pdfLib) {
1218
const pdfLibImport = await loadOptionalDependency<typeof pdfLibTypes>(
13-
"@cantoo/pdf-lib", "Text Embedding"
19+
"@cantoo/pdf-lib", "Image Extraction"
1420
);
1521
pdfLib = (pdfLibImport as any).default || pdfLibImport;
1622
}
1723
return pdfLib!;
1824
}
1925

26+
27+
/**
28+
* Extracts elements from a PDF document based on a list of bounding boxes.
29+
* @param inputSource The input source to extract from.
30+
* @param polygonsPerPage List of polygons to extract from per page.
31+
* @param quality Quality of extracted images, applied as a scale factor on the output dimensions (expected between 0 and 1; defaults to 1).
32+
*/
33+
export async function extractImagesFromPolygon(
34+
inputSource: LocalInputSource,
35+
polygonsPerPage: Map<number, Polygon[]>,
36+
quality?: number
37+
) {
38+
const allExtractedImages: ExtractedImage[] = [];
39+
const pdfDoc = await createPdfFromInputSource(inputSource);
40+
41+
for (const [pageId, polygons] of polygonsPerPage) {
42+
logger.debug(`Extracting images from page ${pageId}`);
43+
const pdfPage = pdfDoc.getPage(pageId);
44+
const extractions = (await extractFromPage(pdfPage, polygons, true, quality));
45+
const extractedImages = extractions.map(
46+
(v, i) => new ExtractedImage(v, inputSource.filename + `_page${pageId}-${i}.jpg`, pageId, i)
47+
);
48+
allExtractedImages.push(...extractedImages);
49+
}
50+
return allExtractedImages;
51+
}
52+
2053
/**
2154
* Extracts elements from a page based on a list of bounding boxes.
2255
*
2356
* @param pdfPage PDF Page to extract from.
2457
* @param polygons List of coordinates to pull the elements from.
58+
* @param asImage Whether to return the extracted elements as images.
59+
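* @param quality Quality of extracted elements, applied as a scale factor on the output dimensions. Expected between 0 and 1 (defaults to 1); negative values throw, values above 1 upscale the output.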
2560
*/
2661
export async function extractFromPage(
2762
pdfPage: pdfLibTypes.PDFPage,
28-
polygons: Polygon[]
63+
polygons: Polygon[],
64+
asImage: boolean = false,
65+
quality?: number,
2966
) {
3067
const pdfLib = await getPdfLib();
3168
const { width, height } = pdfPage.getSize();
32-
const extractedElements :Uint8Array[] = [];
33-
// Manual upscale.
34-
// Fixes issues with the OCR.
35-
const qualityScale = 300/72;
69+
const extractedElements: Uint8Array[] = [];
70+
if (quality && (quality < 0)) {
71+
throw new MindeeImageError("Quality must be a number between 0 and 1");
72+
}
73+
if (quality && quality > 1) {
74+
logger.warn("Quality is greater than 1, this operation will apply a manual upscale on the output." +
75+
" Use only if you know what you are doing.");
76+
}
77+
const qualityScale = quality ?? 1;
3678
const orientation = pdfPage.getRotation().angle;
3779

80+
const sourceDoc = pdfPage.doc;
81+
const pageIndex = sourceDoc.getPages().indexOf(pdfPage);
82+
3883
for (const origPolygon of polygons) {
39-
const polygon = adjustForRotation(origPolygon, orientation);
84+
logger.debug(`Extracting image with polygon: ${origPolygon.toString()}`);
4085

4186
const tempPdf = await pdfLib.PDFDocument.create();
4287

88+
const [copiedPage] = await tempPdf.copyPages(sourceDoc, [pageIndex]);
89+
90+
const polygon = adjustForRotation(origPolygon, orientation);
91+
4392
const newWidth = width * (getMinMaxX(polygon).max - getMinMaxX(polygon).min);
4493
const newHeight = height * (getMinMaxY(polygon).max - getMinMaxY(polygon).min);
45-
const cropped = await tempPdf.embedPage(pdfPage, {
94+
95+
const cropped = await tempPdf.embedPage(copiedPage, {
4696
left: getMinMaxX(polygon).min * width,
4797
right: getMinMaxX(polygon).max * width,
4898
top: height - (getMinMaxY(polygon).min * height),
4999
bottom: height - (getMinMaxY(polygon).max * height),
50100
});
51101

52-
// Determine the final page dimensions based on orientation
53102
let finalWidth: number;
54103
let finalHeight: number;
55104
if (orientation === 90 || orientation === 270) {
56-
// For 90/270 rotations, swap width and height
57105
finalWidth = newHeight * qualityScale;
58106
finalHeight = newWidth * qualityScale;
59107
} else {
@@ -62,15 +110,14 @@ export async function extractFromPage(
62110
}
63111

64112
const samplePage = tempPdf.addPage([finalWidth, finalHeight]);
65-
66113
samplePage.drawRectangle({
67114
x: 0,
68115
y: 0,
69116
width: finalWidth,
70117
height: finalHeight,
118+
color: pdfLib.rgb(1, 1, 1),
71119
});
72120

73-
// Draw the cropped page with rotation applied
74121
if (orientation === 0) {
75122
samplePage.drawPage(cropped, {
76123
width: newWidth * qualityScale,
@@ -102,7 +149,13 @@ export async function extractFromPage(
102149
});
103150
}
104151

105-
extractedElements.push(await tempPdf.save());
152+
const pdfBuffer = Buffer.from(await tempPdf.save());
153+
if (asImage) {
154+
extractedElements.push(await rasterizePage(pdfBuffer, 0, 100));
155+
} else {
156+
extractedElements.push(pdfBuffer);
157+
}
106158
}
159+
107160
return extractedElements;
108161
}

src/pdf/extractedPdf.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import path from "node:path";
2+
import { BufferInput, MIMETYPES } from "@/input/index.js";
3+
import { MindeeError } from "@/errors/index.js";
4+
import { Buffer } from "node:buffer";
5+
import { writeFile } from "fs/promises";
6+
import { logger } from "@/logger.js";
7+
import { writeFileSync } from "node:fs";
8+
9+
export class ExtractedPdf {
10+
public readonly buffer: Buffer;
11+
public readonly filename: string;
12+
public readonly pageCount: number;
13+
14+
constructor(pdfData: Buffer<ArrayBufferLike>, filename: string, pageCount: number) {
15+
this.buffer = pdfData;
16+
this.filename = filename;
17+
this.pageCount = pageCount;
18+
}
19+
20+
/**
21+
* Saves the document to a file.
22+
*
23+
* @param outputPath Path to save the file to.
24+
*/
25+
async saveToFileAsync(outputPath: string) {
26+
const fileExt = path.extname(outputPath).toLowerCase();
27+
if (fileExt !== ".pdf" && !MIMETYPES.has(fileExt)) {
28+
outputPath += ".pdf";
29+
}
30+
31+
try {
32+
await writeFile(path.resolve(outputPath), this.buffer);
33+
logger.info(`File saved successfully to ${path.resolve(outputPath)}.`);
34+
} catch (e) {
35+
if (e instanceof TypeError) {
36+
throw new MindeeError("Invalid path/filename provided.");
37+
} else {
38+
throw e;
39+
}
40+
}
41+
}
42+
43+
/**
44+
* Saves the document to a file synchronously.
45+
* @param outputPath Path to save the file to.
46+
*/
47+
saveToFile(outputPath: string){
48+
try {
49+
writeFileSync(path.resolve(outputPath), this.buffer);
50+
logger.info(`File saved successfully to ${path.resolve(outputPath)}.`);
51+
} catch (e) {
52+
if (e instanceof TypeError) {
53+
throw new MindeeError("Invalid path/filename provided.");
54+
} else {
55+
throw e;
56+
}
57+
}
58+
}
59+
60+
/**
61+
* Return the file as a Mindee-compatible BufferInput source.
62+
*
63+
* @returns A BufferInput source.
64+
*/
65+
asSource(): BufferInput {
66+
return new BufferInput({
67+
buffer: this.buffer,
68+
filename: this.filename,
69+
});
70+
}
71+
}

src/pdf/pdfCompressor.ts

Lines changed: 2 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,10 @@
11
import { logger } from "@/logger.js";
2-
import tmp from "tmp";
3-
import * as fs from "node:fs";
4-
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
5-
// @ts-ignore
6-
import type * as popplerTypes from "node-poppler";
72
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
83
// @ts-ignore
94
import type * as pdfLibTypes from "@cantoo/pdf-lib";
105
import { compressImage } from "@/image/index.js";
116
import { loadOptionalDependency } from "@/dependency/index.js";
12-
import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText } from "./pdfUtils.js";
7+
import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText, rasterizePage } from "./pdfUtils.js";
138

149
let pdfLib: typeof pdfLibTypes | null = null;
1510

@@ -159,7 +154,7 @@ async function compressPagesWithQuality(
159154
const page = pdfDoc.getPages()[i];
160155
const rasterizedPage = await rasterizePage(pdfData, i + 1, imageQuality);
161156
const compressedImage = await compressImage(
162-
Buffer.from(rasterizedPage, "binary"), imageQuality
157+
rasterizedPage, imageQuality
163158
);
164159
if (!disableSourceText) {
165160
await addTextToPdfPage(page, extractedText);
@@ -260,48 +255,6 @@ async function getFontFromName(fontName: string): Promise<pdfLibTypes.PDFFont> {
260255
return font;
261256
}
262257

263-
/**
264-
* Rasterizes a PDF page.
265-
*
266-
* @param pdfData Buffer representation of the entire PDF file.
267-
* @param index Index of the page to rasterize.
268-
* @param quality Quality to apply during rasterization.
269-
*/
270-
async function rasterizePage(
271-
pdfData: Buffer, index: number, quality = 85
272-
): Promise<string> {
273-
const popplerImport = await loadOptionalDependency<typeof popplerTypes>(
274-
"node-poppler", "Image Processing"
275-
);
276-
const poppler = (popplerImport as any).default || popplerImport;
277-
const popplerInstance = new poppler.Poppler();
278-
const tmpPdf = tmp.fileSync();
279-
const tempPdfPath = tmpPdf.name;
280-
const antialiasOption: "fast" | "best" | "default" | "good" | "gray" | "none" | "subpixel" = "best";
281-
try {
282-
await fs.promises.writeFile(tempPdfPath, pdfData);
283-
const options = {
284-
antialias: antialiasOption,
285-
firstPageToConvert: index,
286-
lastPageToConvert: index,
287-
jpegFile: true,
288-
jpegOptions: `quality=${quality}`,
289-
singleFile: true
290-
};
291-
292-
const jpegBuffer = await popplerInstance.pdfToCairo(tempPdfPath, undefined, options);
293-
294-
await fs.promises.unlink(tempPdfPath);
295-
296-
return jpegBuffer;
297-
} catch (error) {
298-
logger.error("Error rasterizing PDF:", error);
299-
throw error;
300-
} finally {
301-
tmpPdf.removeCallback();
302-
}
303-
}
304-
305258
/**
306259
* Performs linear interpolation between two numbers.
307260
* @param start The starting value.

0 commit comments

Comments
 (0)