Skip to content

Commit e88c976

Browse files
committed
Adds sample ExtractHighlightedText
1 parent 7febf70 commit e88c976

2 files changed

Lines changed: 189 additions & 0 deletions

File tree

src/ExtractHighlightedText.hpp

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
////////////////////////////////////////////////////////////////////////////////////////////////////
2+
// ExtractHighlightedText.hpp
3+
// Copyright (c) 2018 Pdfix. All Rights Reserved.
4+
////////////////////////////////////////////////////////////////////////////////////////////////////
5+
/*!
6+
\page CPP_Samples C++ Samples
7+
- \subpage ExtractHighlightedText_cpp
8+
*/
9+
/*!
10+
\page ExtractHighlightedText_cpp Extract Highlighted Text Sample
11+
Example of how to extract the highlighted text from a PDF document and save it to a text file.
12+
\snippet /ExtractHighlightedText.hpp ExtractHighlightedText_cpp
13+
*/
14+
15+
#pragma once
16+
17+
//! [ExtractHighlightedText_cpp]
18+
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include "Pdfix.h"
22+
23+
extern std::string ToUtf8(const std::wstring& wstr);
24+
25+
// HasHighlight rerturns true if there is an highlight annotation over the char_rect
26+
bool HasHighlight(PdfPage* page, PdfRect& char_rect) {
27+
// deflate char rect to minimal
28+
char_rect.left += (char_rect.right - char_rect.left) / 2.;
29+
char_rect.right = char_rect.left + 1;
30+
char_rect.bottom += (char_rect.top - char_rect.bottom) / 2.;
31+
char_rect.top = char_rect.bottom + 1;
32+
33+
// get annotations over the bbox
34+
int num_annots = page->GetNumAnnotsAtRect(&char_rect);
35+
for (int i = 0; i < num_annots; i++) {
36+
PdfAnnot* annot = page->GetAnnotAtRect(&char_rect, i);
37+
if (annot) {
38+
PdfAnnotSubtype subtype = annot->GetSubtype();
39+
if (subtype == kAnnotHighlight)
40+
return true;
41+
}
42+
}
43+
return false;
44+
}
45+
46+
// GetHighlightedText processes each element recursively.
47+
// If the element is a highlighted text, saves it to the output stream.
48+
void GetHighlightedText(PdfPage* page, PdeElement* element, std::stringstream& ss) {
49+
PdfElementType elem_type = element->GetType();
50+
if (elem_type == kPdeText) {
51+
PdeText* text_elem = static_cast<PdeText*>(element);
52+
std::string text;
53+
54+
int num_lines = text_elem->GetNumTextLines();
55+
for (int l = 0; l < num_lines; l++) {
56+
PdeTextLine* line = text_elem->GetTextLine(l);
57+
if (!line)
58+
return;
59+
// if line is bullet or newline, write a new line
60+
if ((line->GetFlags() & (kTextLineBullet | kTextLineNewLine)) != 0 && l > 0) {
61+
// write text into the output stream
62+
if (text.size() > 0) {
63+
ss << text;
64+
ss << std::endl;
65+
text = "";
66+
}
67+
}
68+
69+
int num_words = line->GetNumWords();
70+
for (int w = 0; w < num_words; w++) {
71+
PdeWord* word = line->GetWord(w);
72+
if (!word)
73+
return;
74+
// iterate through each character
75+
int length = word->GetNumChars();
76+
for (int i = 0; i < length; i++) {
77+
PdfRect char_bbox;
78+
word->GetCharBBox(i, &char_bbox);
79+
80+
// add text only if there is a highlight over it
81+
if (HasHighlight(page, char_bbox)) {
82+
std::wstring char_str;
83+
char_str.resize(word->GetCharText(i, nullptr, 0));
84+
word->GetCharText(i, (wchar_t*)char_str.c_str(), char_str.size());
85+
text += ToUtf8(char_str);
86+
}
87+
}
88+
// add whitespace between words
89+
if (text.size() > 0)
90+
text += " ";
91+
}
92+
// add whitespace between lines
93+
if ((l != num_lines - 1) && (text.size() > 0))
94+
text += " ";
95+
}
96+
97+
// write text into the output stream
98+
if (text.size() > 0) {
99+
ss << text;
100+
ss << std::endl;
101+
text = "";
102+
}
103+
}
104+
else {
105+
// process children
106+
int count = element->GetNumChildren();
107+
if (count == 0)
108+
return;
109+
for (int i = 0; i < count; i++) {
110+
PdeElement* child = element->GetChild(i);
111+
if (child)
112+
GetHighlightedText(page, child, ss);
113+
}
114+
}
115+
}
116+
117+
// Extracts texts from the document and saves them to TXT format.
118+
void ExtractHighlightedText(
119+
const std::wstring& email, // authorization email
120+
const std::wstring& license_key, // authorization license key
121+
const std::wstring& open_path, // source PDF document
122+
const std::wstring& save_path, // output TXT file
123+
const std::wstring& config_path // configuration file
124+
) {
125+
std::cout << "ExtractHighlightedText " << std::endl;
126+
127+
// initialize Pdfix
128+
if (!Pdfix_init(Pdfix_MODULE_NAME))
129+
throw std::runtime_error("Pdfix initialization fail.");
130+
131+
Pdfix* pdfix = GetPdfix();
132+
if (!pdfix)
133+
throw std::runtime_error("GetPdfix fail");
134+
if (!pdfix->Authorize(email.c_str(), license_key.c_str()))
135+
throw std::runtime_error(pdfix->GetError());
136+
137+
PdfDoc* doc = pdfix->OpenDoc(open_path.c_str(), L"");
138+
if (!doc)
139+
throw std::runtime_error(pdfix->GetError());
140+
141+
std::stringstream ss;
142+
143+
ss << "<!-- PDFix SDK " << pdfix->GetVersionMajor() << "." << pdfix->GetVersionMinor()
144+
<< "." << pdfix->GetVersionPatch()
145+
<< " conversion PDF to TXT. License: http://pdfix.net/terms -->" << std::endl;
146+
147+
auto num_pages = doc->GetNumPages();
148+
for (auto i = 0; i < num_pages; i++) {
149+
std::cout << std::endl;
150+
std::cout << "Processing pages..." << i + 1 << "/" << num_pages;
151+
152+
ss << std::endl << "Page: " << i + 1 << std::endl;
153+
154+
PdfPage* page = doc->AcquirePage(i);
155+
if (!page)
156+
throw std::runtime_error(pdfix->GetError());
157+
PdePageMap* page_map = page->AcquirePageMap(nullptr, nullptr);
158+
if (!page_map)
159+
throw std::runtime_error(pdfix->GetError());
160+
161+
PdeElement* container = page_map->GetElement();
162+
if (!container)
163+
throw std::runtime_error(pdfix->GetError());
164+
165+
GetHighlightedText(page, container, ss);
166+
167+
doc->ReleasePage(page);
168+
}
169+
std::cout << std::endl;
170+
171+
// write text to file stream
172+
PsFileStream* stream = pdfix->CreateFileStream(save_path.c_str(), kPsWrite);
173+
if (!stream)
174+
throw std::runtime_error(pdfix->GetError());
175+
stream->Write(stream->GetPos(),
176+
(const uint8_t*)ss.str().c_str(), ss.str().length());
177+
stream->Destroy();
178+
179+
// destroy variables
180+
doc->Close();
181+
pdfix->Destroy();
182+
}
183+
184+
//! [ExtractHighlightedText_cpp]

src/Main.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "EmbedFonts.hpp"
2020
#include "ExportFormFieldValues.hpp"
2121
#include "ExtractText.hpp"
22+
#include "ExtractHighlightedText.hpp"
2223
#include "ExtractImages.hpp"
2324
#include "ExtractTables.hpp"
2425
#include "FlattenAnnots.hpp"
@@ -84,6 +85,10 @@ int main()
8485
ExportFormFieldValues(email, key, open_path,
8586
output_dir + L"/ExportFormFieldValues.txt");
8687

88+
std::cout << "ExtractHighlightedText" << std::endl;
89+
ExtractHighlightedText(email, key, open_path, output_dir + L"/ExtractHighlightedText.txt",
90+
config_path);
91+
8792
std::cout << "ExtractImages" << std::endl;
8893
PdfImageParams image_params;
8994
ExtractImages(email, key, open_path, output_dir + L"/", 800,

0 commit comments

Comments
 (0)