-
Notifications
You must be signed in to change notification settings - Fork 214
Expand file tree
/
Copy pathtest_extract_text.rb
More file actions
executable file
·89 lines (75 loc) · 3.41 KB
/
test_extract_text.rb
File metadata and controls
executable file
·89 lines (75 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
here = File.expand_path(File.dirname(__FILE__))
require File.join(here, '..', 'test_helper')
require 'tmpdir'
# Exercises Docsplit.extract_text against the PDF fixtures under
# test/fixtures and inspects the text files written to OUTPUT.
#
# NOTE(review): OUTPUT and assert_directory_contains are not defined in this
# file; presumably they come from test_helper -- confirm there.
class ExtractTextTest < Minitest::Test
  # Extracting all pages of obama_arts.pdf should yield two text files, the
  # first of which contains the campaign disclaimer.
  def test_paged_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
    assert_includes File.read("#{OUTPUT}/obama_arts_1.txt"), "Paid for by Obama for America"
  end

  # Requesting only page 2 should write exactly one file, named for that page.
  def test_page_only_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 2..2, :output => OUTPUT)
    assert_equal ["#{OUTPUT}/obama_arts_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # An upper-case file name and extension (OBAMA_ARTS.PDF) should be accepted,
  # and the output file should keep the upper-case base name.
  def test_capitalized_pdf_extraction
    # Remove any text files already in OUTPUT so the exact-match assertion
    # below only sees what this test produced.
    Dir["#{OUTPUT}/*.txt"].each { |previous| FileUtils.rm(previous) }
    Dir.mktmpdir do |dir|
      FileUtils.cp('test/fixtures/obama_arts.pdf', "#{dir}/OBAMA_ARTS.PDF")
      Docsplit.extract_text("#{dir}/OBAMA_ARTS.PDF", :pages => 2..2, :output => OUTPUT)
    end
    assert_equal ["#{OUTPUT}/OBAMA_ARTS_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # Extracting all pages of unicode.pdf should yield three text files.
  def test_unicode_extraction
    Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 3, Dir["#{OUTPUT}/*.txt"].length
  end

  # corrosion.pdf should produce a non-trivial text file for each of pages 1-4.
  def test_ocr_extraction
    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
    (1..4).each do |page|
      file = "corrosion_#{page}.txt"
      assert_directory_contains(OUTPUT, file)
      assert_operator File.read(File.join(OUTPUT, file)).size, :>, 1,
                      "Expected that file with extracted text should have reasonable size"
    end
  end

  # Asking for a language named "mock" should fail with an error whose message
  # mentions the tessdata/mock path.
  def test_ocr_extraction_in_mock_language
    exception = assert_raises(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")
    end
    assert_includes exception.message, "tessdata/mock",
                    "Expected problem with loading data for language 'mock'"
  end

  # Extracting from the encrypted fixture should raise rather than succeed.
  def test_password_protected
    # Fully qualified to match test_ocr_extraction_in_mock_language and to
    # avoid relying on Docsplit being mixed into the test class.
    assert_raises(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')
    end
  end

  # A file name containing spaces, single quotes and double quotes should
  # still be extracted page by page (two text files expected).
  def test_name_escaping_while_extracting_text_into_pages
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
  end

  # Same awkward file name, extracted with :ocr => true and no :pages option
  # (one text file expected).
  def test_name_escaping_while_extracting_text_using_ocr
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :ocr => true, :output => OUTPUT)
    assert_equal 1, Dir["#{OUTPUT}/*.txt"].length
  end

  # With :force_ocr on the re-oriented fixture, the extracted text should be
  # dominated by letters rather than other non-whitespace characters.
  # Skipped when Docsplit::DEPENDENCIES[:osd] is falsy.
  def test_orientation_detected_ocr_extraction
    skip "Orientation detection module (osd) for Tesseract isn't installed" unless Docsplit::DEPENDENCIES[:osd]

    pages = 1..4
    Docsplit.extract_text('test/fixtures/corrosion.reoriented.pdf', :output => OUTPUT, :pages => pages, :force_ocr => true)

    letters = 0
    nonletters = 0
    pages.each do |number|
      # Block form so the file handle is closed once the page has been read.
      File.open(File.join(OUTPUT, "corrosion.reoriented_#{number}.txt")) do |page_text|
        page_text.each_char do |c|
          case c
          when /[A-Za-z]/
            letters += 1
          when /\s/
            # Whitespace counts toward neither total.
          else
            nonletters += 1
          end
        end
      end
    end

    # the corrosion.pdf has 6160 letters & 362 nonletters, or ~17:1
    # so let's give a fudge factor of ~half of that or 8:1
    # (integer division, as both counts are Integers)
    assert letters / 8 > nonletters, "Expected that text extracted with orientation detection would have more letters."
  end
end