Skip to content

Commit f09c216

Browse files
ConvoluteHumanBot authored and asturio committed
Added coordinates-based text extraction to PdfContentTextLocator.java.
Added tests for the new feature.
1 parent 5e0d88d commit f09c216

4 files changed

Lines changed: 248 additions & 37 deletions

File tree

openpdf-core/src/main/java/org/openpdf/text/pdf/parser/MatchedPattern.java

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,15 @@ public class MatchedPattern {
5353
private final float[] coordinates = new float[4];
5454

5555
/**
56-
* Constructor to pair a strip of text with its bounding box coordinates inside a page.
57-
* The coordinates system has the origin (0, 0) in the lower left point of the page
58-
* and uses PDF points as unit measure.
56+
* Constructor to pair a strip of text with its bounding box coordinates inside a page. The coordinate system has
57+
* the origin (0, 0) in the lower left point of the page and uses PDF points as the unit of measure.
5958
*
60-
* @param text string
61-
* @param page int
62-
* @param llx float lower left x coordinate
63-
* @param lly float lower left y coordinate
64-
* @param urx float upper right x coordinate
65-
* @param ury float upper right y coordinate
59+
* @param text string
60+
* @param page int
61+
* @param llx float lower left x coordinate
62+
* @param lly float lower left y coordinate
63+
* @param urx float upper right x coordinate
64+
* @param ury float upper right y coordinate
6665
*/
6766
MatchedPattern(String text, int page, float llx, float lly, float urx, float ury) {
6867
this.text = text;
@@ -85,13 +84,21 @@ public float[] getCoordinates() {
8584
return coordinates;
8685
}
8786

87+
public String printCoordinates() {
88+
return "[llx: " + coordinates[0] + ", lly: " + coordinates[1] + ", urx: " + coordinates[2] + ", ury: " + coordinates[3] + "]";
89+
}
90+
8891
@Override
8992
public String toString() {
90-
String[] c = new String[4];
91-
for(int i = 0; i < 4; i++) {
92-
c[i] = String.valueOf(coordinates[i]);
93-
}
94-
return "[" + String.join(", ", c) + "]";
93+
StringBuilder sb = new StringBuilder();
94+
sb.append("Text: [")
95+
.append(this.text)
96+
.append("] - boundingBox: ")
97+
.append(this.printCoordinates())
98+
.append(" - page: [")
99+
.append(this.page)
100+
.append("]");
101+
return sb.toString();
95102
}
96103

97104
}

openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfContentTextLocator.java

Lines changed: 105 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,59 @@ public class PdfContentTextLocator extends PdfContentStreamHandler {
6363
private final ArrayList<Float> fragmentsWidths = new ArrayList<>();
6464

6565
private final int page;
66-
private final Pattern p;
67-
66+
private Pattern p;
67+
private float[] coordinates;
68+
private final int mode;
6869

70+
/**
71+
* Construct a PdfContentStreamHandler for regex-based text extraction
72+
*
73+
* @param renderListener the text assembler
74+
* @param pattern the pattern to match text against
75+
* @param page PdfPage to inspect
76+
*/
6977
public PdfContentTextLocator(TextAssembler renderListener, String pattern, int page) {
    super(renderListener);
    if (null == pattern) {
        throw new IllegalArgumentException("Pattern cannot be null");
    }
    // Only a zero-length pattern is rejected: the length is tested (rather than blankness)
    // so that whitespace-only patterns remain valid search terms.
    if (pattern.length() == 0) {
        throw new IllegalArgumentException("Pattern sequence must be longer than 0");
    }
    this.page = page;
    // Mode 1 makes displayPdfString dispatch to the regex-based matchPdfString.
    this.mode = 1;
    this.p = Pattern.compile(pattern);
    installDefaultOperators();
    reset();
}
92+
93+
/**
94+
* Construct a PdfContentStreamHandler for coordinates-based text extraction
95+
*
96+
* @param renderListener the text assembler
97+
* @param coordinates the bounding box to search text within
98+
* @param page PdfPage to inspect
99+
*/
100+
public PdfContentTextLocator(TextAssembler renderListener, float[] coordinates, int page) {
    super(renderListener);
    // A missing box is reported with IllegalArgumentException, the same exception type the
    // regex-based constructor uses for a null pattern, instead of surfacing as a
    // NullPointerException from the length check below.
    if (coordinates == null) {
        throw new IllegalArgumentException("Coordinates bounding box cannot be null");
    }
    if (coordinates.length != 4) {
        throw new IllegalArgumentException("Coordinates bounding box must be an array of "
                + "four floats, "
                + "[x1, y1, x2, y2] {lower left point, upper right point}");
    }
    // The box must not be inverted on either axis; a zero-width or zero-height box is accepted.
    if (coordinates[2] < coordinates[0]) {
        throw new IllegalArgumentException("x2 {coordinates[2]} must be greater than or equal to x1 "
                + "{coordinates[0]}");
    }
    if (coordinates[3] < coordinates[1]) {
        throw new IllegalArgumentException("y2 {coordinates[3]} must be greater than or equal to y1 "
                + "{coordinates[1]}");
    }
    // Keep a private copy so that later changes to the caller's array cannot bypass the
    // validation above or move the search area while a page is being processed.
    this.coordinates = coordinates.clone();
    this.page = page;
    // Mode 2 makes displayPdfString dispatch to the coordinates-based locatePdfString.
    this.mode = 2;
    installDefaultOperators();
    reset();
}
@@ -103,13 +146,12 @@ public void reset() {
103146
}
104147

105148
/**
106-
* Search for a pattern in a PdfString
107-
* and if found, collect its bounding box
149+
* Extract a PdfString content and coordinates based on the handler extraction pattern: either matches a given regex
150+
* or intersects a given bounding box
108151
*
109152
* @param string the text to inspect
110153
*/
111154
void displayPdfString(PdfString string) {
112-
113155
String decoded;
114156
byte[] bytes;
115157
if (BaseFont.IDENTITY_H.equals(graphicsState().getFont().getEncoding())) {
@@ -135,20 +177,72 @@ void displayPdfString(PdfString string) {
135177
counter++;
136178
}
137179

138-
float pdfStringWidth = startWidth + totalWidth;
139180
float y = new Vector(0, 0, 1f).cross(textMatrix).get(1);
140-
float y1 = y + graphicsState().getFontDescentDescriptor();
141-
float y2 = y + graphicsState().getFontAscentDescriptor();
181+
float fontFloor = y + graphicsState().getFontDescentDescriptor();
182+
float fontCeiling = y + graphicsState().getFontAscentDescriptor();
183+
184+
switch (this.mode) {
185+
case 1: {
186+
matchPdfString(decoded, widths, totalWidth, fontFloor, fontCeiling);
187+
break;
188+
}
189+
case 2: {
190+
locatePdfString(decoded, startWidth, totalWidth, fontFloor, fontCeiling);
191+
break;
192+
}
193+
default: {
194+
//do nothing for now
195+
}
196+
}
197+
}
142198

199+
/**
200+
* Search for a pattern in a PdfString and if found, collect its bounding box
201+
*
202+
* @param decoded the text to inspect
203+
* @param widths array of prefix widths of each char
204+
* @param totalWidth width of the text
205+
* @param fontFloor lowest y-coordinate of the font
206+
* @param fontCeiling highest y-coordinate of the font
207+
*/
208+
private void matchPdfString(String decoded, float[] widths, float totalWidth, float fontFloor, float fontCeiling) {
143209
Matcher m = p.matcher(decoded);
144210
while (m.find()) {
145211
float x1 = widths[m.start()];
146212
float x2 = widths[m.end()];
147-
MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, y1, x2, y2);
213+
MatchedPattern mp = new MatchedPattern(decoded, this.page, x1, fontFloor, x2, fontCeiling);
148214
accumulator.add(mp);
149215
}
216+
textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix);
217+
}
150218

219+
/**
220+
* Extract text if its coordinates intersect with the given bounding box
221+
*
222+
* @param decoded the text to inspect
223+
* @param startWidth left-most x-coordinate of the text
224+
* @param totalWidth width of the text
225+
* @param fontFloor lowest y-coordinate of the font
226+
* @param fontCeiling highest y-coordinate of the font
227+
*/
228+
private void locatePdfString(String decoded, float startWidth, float totalWidth, float fontFloor,
229+
float fontCeiling) {
230+
float endWidth = startWidth + totalWidth;
151231
textMatrix = new Matrix(totalWidth, 0).multiply(textMatrix);
232+
if (startWidth < this.coordinates[0] && endWidth < this.coordinates[0]) {
233+
return;
234+
}
235+
if (startWidth > this.coordinates[2]) {
236+
return;
237+
}
238+
if (fontFloor < this.coordinates[1] && fontCeiling < this.coordinates[1]) {
239+
return;
240+
}
241+
if (fontFloor > this.coordinates[3]) {
242+
return;
243+
}
244+
MatchedPattern mp = new MatchedPattern(decoded, this.page, startWidth, fontFloor, endWidth, fontCeiling);
245+
accumulator.add(mp);
152246
}
153247

154248
private float convertHeightToUser(float height) {

openpdf-core/src/main/java/org/openpdf/text/pdf/parser/PdfTextLocator.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,42 @@ public ArrayList<MatchedPattern> searchFile(String pattern) throws IOException {
177177
return res;
178178
}
179179

180+
/**
181+
* Locates text within a bounding box inside a page
182+
*
183+
* @param page page number we are interested in
184+
* @param coordinates bounding box to extract text from
185+
* @return {@code ArrayList<MatchedPattern>} List of matched text patterns with coordinates.
186+
* @throws IOException on error
187+
*/
188+
public ArrayList<MatchedPattern> searchPage(int page, float[] coordinates) throws IOException {
189+
PdfDictionary pageDict = reader.getPageN(page);
190+
if (pageDict == null) {
191+
return new ArrayList<>();
192+
}
193+
PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES);
194+
renderListener.reset();
195+
renderListener.setPage(page);
196+
PdfContentTextLocator handler = new PdfContentTextLocator(renderListener, coordinates, page);
197+
processContent(getContentBytesForPage(page), resources, handler);
198+
return handler.getMatchedPatterns();
199+
}
200+
201+
/**
202+
* Locates text within a bounding box inside a PDF
203+
*
204+
* @param coordinates bounding box to extract text from
205+
* @return {@code ArrayList<MatchedPattern>} List of matched text patterns with coordinates.
206+
* @throws IOException on error
207+
*/
208+
public ArrayList<MatchedPattern> searchFile(float[] coordinates) throws IOException {
209+
ArrayList<MatchedPattern> res = new ArrayList<>();
210+
for (int page = 1; page <= reader.getNumberOfPages(); page++) {
211+
res.addAll(searchPage(page, coordinates));
212+
}
213+
return res;
214+
}
215+
180216
/**
181217
* Processes PDF syntax
182218
*

0 commit comments

Comments
 (0)