@@ -63,16 +63,59 @@ public class PdfContentTextLocator extends PdfContentStreamHandler {
6363 private final ArrayList <Float > fragmentsWidths = new ArrayList <>();
6464
6565 private final int page ;
66- private final Pattern p ;
67-
66+ private Pattern p ;
67+ private float [] coordinates ;
68+ private final int mode ;
6869
70+ /**
71+ * Construct a content PdfContetStreamHandler for regex-based text extraction pattern
72+ *
73+ * @param renderListener the text assembler
74+ * @param pattern the pattern to match text against
75+ * @param page PdfPage to inspect
76+ */
6977 public PdfContentTextLocator (TextAssembler renderListener , String pattern , int page ) {
7078 super (renderListener );
71- if (pattern == null ) throw new IllegalArgumentException ("Pattern cannot be null" );
79+ if (pattern == null ) {
80+ throw new IllegalArgumentException ("Pattern cannot be null" );
81+ }
7282 //We check for length because we want to include whitespaces as possible patterns
73- if (pattern .isEmpty ()) throw new IllegalArgumentException ("Pattern sequence must be longer than 0" );
83+ if (pattern .isEmpty ()) {
84+ throw new IllegalArgumentException ("Pattern sequence must be longer than 0" );
85+ }
7486 this .p = Pattern .compile (pattern );
7587 this .page = page ;
88+ this .mode = 1 ;
89+ installDefaultOperators ();
90+ reset ();
91+ }
92+
93+ /**
94+ * Construct a content PdfContetStreamHandler for coordinates-based text extraction pattern
95+ *
96+ * @param renderListener the text assembler
97+ * @param coordinates the bounding box to search text within
98+ * @param page PdfPage to inspect
99+ */
100+ public PdfContentTextLocator (TextAssembler renderListener , float [] coordinates , int page ) {
101+ super (renderListener );
102+ if (coordinates .length != 4 ) {
103+ throw new IllegalArgumentException ("Coordinates bounding box must be an array of "
104+ + "four floats, "
105+ + "[x1, y1, x2, y2] {lower left point, upper right point}" );
106+ }
107+ if (coordinates [2 ] < coordinates [0 ]) {
108+ throw new IllegalArgumentException ("x2 {coordinates[2]} must be greater than or equal to x1 "
109+ + "{coordinates[0]}" );
110+ }
111+ if (coordinates [3 ] < coordinates [1 ]) {
112+ throw new IllegalArgumentException ("y2 {coordinates[3]} must be greater than or equal to y1 "
113+ + "{coordinates[1]}" );
114+ }
115+ this .coordinates = coordinates ;
116+ //We check for length because we want to include whitespaces as possible patterns
117+ this .page = page ;
118+ this .mode = 2 ;
76119 installDefaultOperators ();
77120 reset ();
78121 }
@@ -103,13 +146,12 @@ public void reset() {
103146 }
104147
105148 /**
106- * Search for a pattern in a PdfString
107- * and if found, collect its bounding box
149+ * Extract a PdfString content and coordinates based on the handler extraction pattern: either matches a given regex
150+ * or intersects a given bounding box
108151 *
109152 * @param string the text to inspect
110153 */
111154 void displayPdfString (PdfString string ) {
112-
113155 String decoded ;
114156 byte [] bytes ;
115157 if (BaseFont .IDENTITY_H .equals (graphicsState ().getFont ().getEncoding ())) {
@@ -135,20 +177,72 @@ void displayPdfString(PdfString string) {
135177 counter ++;
136178 }
137179
138- float pdfStringWidth = startWidth + totalWidth ;
139180 float y = new Vector (0 , 0 , 1f ).cross (textMatrix ).get (1 );
140- float y1 = y + graphicsState ().getFontDescentDescriptor ();
141- float y2 = y + graphicsState ().getFontAscentDescriptor ();
181+ float fontFloor = y + graphicsState ().getFontDescentDescriptor ();
182+ float fontCeiling = y + graphicsState ().getFontAscentDescriptor ();
183+
184+ switch (this .mode ) {
185+ case 1 : {
186+ matchPdfString (decoded , widths , totalWidth , fontFloor , fontCeiling );
187+ break ;
188+ }
189+ case 2 : {
190+ locatePdfString (decoded , startWidth , totalWidth , fontFloor , fontCeiling );
191+ break ;
192+ }
193+ default : {
194+ //do nothing for now
195+ }
196+ }
197+ }
142198
199+ /**
200+ * Search for a pattern in a PdfString and if found, collect its bounding box
201+ *
202+ * @param decoded the text to inspect
203+ * @param widths array of prefix widths of each char
204+ * @param totalWidth width of the text
205+ * @param fontFloor lowest y-coordinate of the font
206+ * @param fontCeiling highest y-coordinate of the font
207+ */
208+ private void matchPdfString (String decoded , float [] widths , float totalWidth , float fontFloor , float fontCeiling ) {
143209 Matcher m = p .matcher (decoded );
144210 while (m .find ()) {
145211 float x1 = widths [m .start ()];
146212 float x2 = widths [m .end ()];
147- MatchedPattern mp = new MatchedPattern (decoded , this .page , x1 , y1 , x2 , y2 );
213+ MatchedPattern mp = new MatchedPattern (decoded , this .page , x1 , fontFloor , x2 , fontCeiling );
148214 accumulator .add (mp );
149215 }
216+ textMatrix = new Matrix (totalWidth , 0 ).multiply (textMatrix );
217+ }
150218
219+ /**
220+ * Extract text if it's coordinates intersect with the given bounding box
221+ *
222+ * @param decoded the text to inspect
223+ * @param startWidth left-most x-coordinate of the text
224+ * @param totalWidth width of the text
225+ * @param fontFloor lowest y-coordinate of the font
226+ * @param fontCeiling highest y-coordinate of the font
227+ */
228+ private void locatePdfString (String decoded , float startWidth , float totalWidth , float fontFloor ,
229+ float fontCeiling ) {
230+ float endWidth = startWidth + totalWidth ;
151231 textMatrix = new Matrix (totalWidth , 0 ).multiply (textMatrix );
232+ if (startWidth < this .coordinates [0 ] && endWidth < this .coordinates [0 ]) {
233+ return ;
234+ }
235+ if (startWidth > this .coordinates [2 ]) {
236+ return ;
237+ }
238+ if (fontFloor < this .coordinates [1 ] && fontCeiling < this .coordinates [1 ]) {
239+ return ;
240+ }
241+ if (fontFloor > this .coordinates [3 ]) {
242+ return ;
243+ }
244+ MatchedPattern mp = new MatchedPattern (decoded , this .page , startWidth , fontFloor , endWidth , fontCeiling );
245+ accumulator .add (mp );
152246 }
153247
154248 private float convertHeightToUser (float height ) {
0 commit comments