1+ import { loadOptionalDependency } from "@/dependency/index.js" ;
2+ import { MindeeImageError } from "@/errors/index.js" ;
3+ import { getMinMaxX , getMinMaxY , Polygon } from "@/geometry/index.js" ;
4+ import { adjustForRotation } from "@/geometry/polygonUtils.js" ;
5+ import { ExtractedImage } from "@/image/extractedImage.js" ;
6+ import { LocalInputSource } from "@/input/index.js" ;
7+ import { logger } from "@/logger.js" ;
8+ import { createPdfFromInputSource } from "@/pdf/pdfOperation.js" ;
9+ import { rasterizePage } from "@/pdf/pdfUtils.js" ;
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
312import type * as pdfLibTypes from "@cantoo/pdf-lib" ;
4- import { getMinMaxX , getMinMaxY , Polygon } from "@/geometry/index.js" ;
5- import { adjustForRotation } from "@/geometry/polygonUtils.js" ;
6- import { loadOptionalDependency } from "@/dependency/index.js" ;
713
/**
 * Cached handle on the optional "@cantoo/pdf-lib" module.
 * Remains `null` until `getPdfLib()` has resolved it once.
 */
let pdfLib: typeof pdfLibTypes | null = null;

/**
 * Resolves the optional "@cantoo/pdf-lib" dependency, loading it on the first
 * call and returning the cached module on every later call.
 *
 * @returns The pdf-lib module namespace.
 */
async function getPdfLib(): Promise<typeof pdfLibTypes> {
  if (pdfLib !== null) {
    return pdfLib;
  }
  const pdfLibImport = await loadOptionalDependency<typeof pdfLibTypes>(
    "@cantoo/pdf-lib", "Image Extraction "
  );
  // The loaded value may expose the namespace under `default`; when it does
  // not, the loaded value itself is used.
  const resolved: typeof pdfLibTypes = (pdfLibImport as any).default || pdfLibImport;
  pdfLib = resolved;
  return resolved;
}
1925
26+
/**
 * Extracts elements from a PDF document based on a list of bounding boxes.
 *
 * Every polygon of every listed page is cropped out through `extractFromPage`
 * (with image output enabled) and wrapped into an `ExtractedImage`.
 *
 * @param inputSource The input source to extract from.
 * @param polygonsPerPage List of polygons to extract from per page. Each key is
 *   the page identifier handed to `getPage()` on the loaded document.
 * @param quality JPEG quality of extracted images, forwarded unchanged to `extractFromPage`.
 * @returns All extracted images, in the iteration order of `polygonsPerPage`
 *   and, within a page, in the order of its polygons.
 */
export async function extractImagesFromPolygon(
  inputSource: LocalInputSource,
  polygonsPerPage: Map<number, Polygon[]>,
  quality?: number
): Promise<ExtractedImage[]> {
  const allExtractedImages: ExtractedImage[] = [];
  const pdfDoc = await createPdfFromInputSource(inputSource);

  for (const [pageId, polygons] of polygonsPerPage) {
    logger.debug(`Extracting images from page ${pageId}`);
    const pdfPage = pdfDoc.getPage(pageId);
    const extractions = await extractFromPage(pdfPage, polygons, true, quality);
    extractions.forEach((buffer, index) => {
      // File name pattern: <source filename>_page<pageId>-<element index>.jpg
      const imageName = `${inputSource.filename}_page${pageId}-${index}.jpg`;
      allExtractedImages.push(new ExtractedImage(buffer, imageName, pageId, index));
    });
  }
  return allExtractedImages;
}
52+
// NOTE(review): this block is a diff rendering. Lines marked "-" are the removed
// version, lines marked "+" are the added version, and the "@@" lines mark hunks
// whose content is not shown here.
2053/**
2154 * Extracts elements from a page based off of a list of bounding boxes.
2255 *
 * For each polygon, the page is copied into a scratch PDF document, cropped to the
 * polygon's bounding box, and drawn onto a new page sized to that box.
 *
2356 * @param pdfPage PDF Page to extract from.
2457 * @param polygons List of coordinates to pull the elements from.
58+ * @param asImage Whether to return the extracted elements as images.
59+ * @param quality JPEG quality of extracted images, given as number between 0 and 1.
 *   NOTE(review): in the visible code this value is used as the scale factor of the
 *   output page (`qualityScale`), not passed to the rasterizer - confirm the intent.
 * @returns The extracted elements as byte arrays: rasterized images when `asImage`
 *   is true, otherwise the saved scratch PDF documents.
 * @throws MindeeImageError when `quality` is negative.
2560 */
2661export async function extractFromPage (
2762 pdfPage : pdfLibTypes . PDFPage ,
28- polygons : Polygon [ ]
63+ polygons : Polygon [ ] ,
64+ asImage : boolean = false ,
65+ quality ?: number ,
2966) {
3067 const pdfLib = await getPdfLib ( ) ;
3168 const { width, height } = pdfPage . getSize ( ) ;
32- const extractedElements :Uint8Array [ ] = [ ] ;
33- // Manual upscale.
34- // Fixes issues with the OCR.
35- const qualityScale = 300 / 72 ;
69+ const extractedElements : Uint8Array [ ] = [ ] ;
// NOTE(review): both checks are guarded by the truthiness of `quality`, so a value of 0
// skips them and is then used as-is by the `??` fallback below.
70+ if ( quality && ( quality < 0 ) ) {
71+ throw new MindeeImageError ( "Quality must be a number between 0 and 1" ) ;
72+ }
73+ if ( quality && quality > 1 ) {
74+ logger . warn ( "Quality is greater than 1, this operation will apply a manual upscale on the output." +
75+ " Use only if you know what you are doing." ) ;
76+ }
// The scale factor falls back to 1 only when `quality` is null or undefined.
77+ const qualityScale = quality ?? 1 ;
3678 const orientation = pdfPage . getRotation ( ) . angle ;
3779
// Position of this page inside its owning document, needed by copyPages() below.
80+ const sourceDoc = pdfPage . doc ;
81+ const pageIndex = sourceDoc . getPages ( ) . indexOf ( pdfPage ) ;
82+
3883 for ( const origPolygon of polygons ) {
39- const polygon = adjustForRotation ( origPolygon , orientation ) ;
84+ logger . debug ( `Extracting image with polygon: ${ origPolygon . toString ( ) } ` ) ;
4085
// A fresh scratch document is created for every polygon.
4186 const tempPdf = await pdfLib . PDFDocument . create ( ) ;
4287
// The source page is copied into the scratch document and the copy is embedded below.
88+ const [ copiedPage ] = await tempPdf . copyPages ( sourceDoc , [ pageIndex ] ) ;
89+
90+ const polygon = adjustForRotation ( origPolygon , orientation ) ;
91+
// Bounding-box size of the polygon, scaled by the page dimensions.
4392 const newWidth = width * ( getMinMaxX ( polygon ) . max - getMinMaxX ( polygon ) . min ) ;
4493 const newHeight = height * ( getMinMaxY ( polygon ) . max - getMinMaxY ( polygon ) . min ) ;
45- const cropped = await tempPdf . embedPage ( pdfPage , {
94+
// Crop box: X bounds are scaled by the page width; Y bounds are scaled by the page
// height and subtracted from it.
95+ const cropped = await tempPdf . embedPage ( copiedPage , {
4696 left : getMinMaxX ( polygon ) . min * width ,
4797 right : getMinMaxX ( polygon ) . max * width ,
4898 top : height - ( getMinMaxY ( polygon ) . min * height ) ,
4999 bottom : height - ( getMinMaxY ( polygon ) . max * height ) ,
50100 } ) ;
51101
52- // Determine the final page dimensions based on orientation
53102 let finalWidth : number ;
54103 let finalHeight : number ;
// For 90/270 degree rotations the output width and height are swapped.
55104 if ( orientation === 90 || orientation === 270 ) {
56- // For 90/270 rotations, swap width and height
57105 finalWidth = newHeight * qualityScale ;
58106 finalHeight = newWidth * qualityScale ;
59107 } else {
@@ -62,15 +110,14 @@ export async function extractFromPage(
62110 }
63111
64112 const samplePage = tempPdf . addPage ( [ finalWidth , finalHeight ] ) ;
65-
// Fill the whole output page with rgb(1, 1, 1) before drawing the crop on top of it.
66113 samplePage . drawRectangle ( {
67114 x : 0 ,
68115 y : 0 ,
69116 width : finalWidth ,
70117 height : finalHeight ,
118+ color : pdfLib . rgb ( 1 , 1 , 1 ) ,
71119 } ) ;
72120
73- // Draw the cropped page with rotation applied
// The crop is drawn with orientation-specific options; only the start of the
// orientation === 0 branch is visible here.
74121 if ( orientation === 0 ) {
75122 samplePage . drawPage ( cropped , {
76123 width : newWidth * qualityScale ,
@@ -102,7 +149,13 @@ export async function extractFromPage(
102149 } ) ;
103150 }
104151
105- extractedElements . push ( await tempPdf . save ( ) ) ;
152+ const pdfBuffer = Buffer . from ( await tempPdf . save ( ) ) ;
// NOTE(review): the meaning of the literal arguments 0 and 100 is not visible here -
// confirm against rasterizePage().
153+ if ( asImage ) {
154+ extractedElements . push ( await rasterizePage ( pdfBuffer , 0 , 100 ) ) ;
155+ } else {
156+ extractedElements . push ( pdfBuffer ) ;
157+ }
106158 }
159+
107160 return extractedElements ;
108161}
0 commit comments