docsplit/test/unit/test_extract_text.rb at 9967ca797b71e985acc4c7caa292035f7d6a50c6 · documentcloud/docsplit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
here = File.expand_path(File.dirname(__FILE__))
require File.join(here, '..', 'test_helper')
require 'tmpdir'

# Exercises Docsplit.extract_text against the PDF fixtures under
# test/fixtures, writing results into the OUTPUT directory.
#
# NOTE(review): OUTPUT and assert_directory_contains are not defined in this
# file; presumably they come from test_helper. Most tests also assume OUTPUT
# holds no leftover *.txt files when they start -- confirm the helper clears
# it between tests.
class ExtractTextTest < Minitest::Test

  # Extracting all pages of obama_arts.pdf should write one text file per
  # page (two in total), and page 1 should contain the campaign disclaimer.
  def test_paged_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
    assert_includes File.read("#{OUTPUT}/obama_arts_1.txt"), "Paid for by Obama for America"
  end

  # Requesting only page 2 should produce exactly that page's text file.
  def test_page_only_extraction
    Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 2..2, :output => OUTPUT)
    assert_equal ["#{OUTPUT}/obama_arts_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # An upper-case file name and extension (OBAMA_ARTS.PDF) should be handled,
  # and the output file should keep the original capitalization.
  def test_capitalized_pdf_extraction
    # Start from a clean slate so the equality check below only sees the
    # file written by this test.
    Dir["#{OUTPUT}/*.txt"].each {|previous| FileUtils.rm(previous) }
    Dir.mktmpdir do |dir|
      FileUtils.cp('test/fixtures/obama_arts.pdf', "#{dir}/OBAMA_ARTS.PDF")
      Docsplit.extract_text("#{dir}/OBAMA_ARTS.PDF", :pages => 2..2, :output => OUTPUT)
    end
    assert_equal ["#{OUTPUT}/OBAMA_ARTS_2.txt"], Dir["#{OUTPUT}/*.txt"]
  end

  # Extracting all pages of unicode.pdf should write three text files.
  def test_unicode_extraction
    Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 3, Dir["#{OUTPUT}/*.txt"].length
  end

  # Each of the four pages of corrosion.pdf should yield a non-trivial text
  # file.
  # NOTE(review): presumably corrosion.pdf has no embedded text, so this
  # goes through the OCR path -- confirm against the fixture.
  def test_ocr_extraction
    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
    4.times do |i|
      file = "corrosion_#{i + 1}.txt"
      assert_directory_contains(OUTPUT, file)
      assert_operator File.read(File.join(OUTPUT, file)).size, :>, 1,
                      "Expected that file with extracted text should have reasonable size"
    end
  end

  # Asking for a language named "mock" should raise
  # Docsplit::ExtractionFailed with a message that mentions the
  # "tessdata/mock" path.
  def test_ocr_extraction_in_mock_language
    exception = assert_raises(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")
    end
    assert_includes exception.message, "tessdata/mock", "Expected problem with loading data for language 'mock'"
  end

  # Extracting text from completely_encrypted.pdf should raise
  # Docsplit::ExtractionFailed.
  def test_password_protected
    # Fully qualified so the lookup does not depend on Docsplit being mixed
    # into the test class, matching test_ocr_extraction_in_mock_language.
    assert_raises(Docsplit::ExtractionFailed) do
      Docsplit.extract_text('test/fixtures/completely_encrypted.pdf')
    end
  end

  # A file name containing spaces, single quotes and double quotes should
  # still extract page by page (two pages expected).
  def test_name_escaping_while_extracting_text_into_pages
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
    assert_equal 2, Dir["#{OUTPUT}/*.txt"].length
  end

  # The same awkward file name with :ocr => true and no :pages option should
  # produce a single text file.
  def test_name_escaping_while_extracting_text_using_ocr
    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :ocr => true, :output => OUTPUT)
    assert_equal 1, Dir["#{OUTPUT}/*.txt"].length
  end

  # Forces OCR on corrosion.reoriented.pdf and checks that the extracted
  # text is mostly letters. Skipped when Docsplit::DEPENDENCIES[:osd] is
  # falsy.
  def test_orientation_detected_ocr_extraction
    skip "Orientation detection module (osd) for Tesseract isn't installed" unless Docsplit::DEPENDENCIES[:osd]

    pages = 1..4
    Docsplit.extract_text('test/fixtures/corrosion.reoriented.pdf', :output => OUTPUT, :pages=>pages, :force_ocr => true)
    letters = 0
    nonletters = 0

    pages.each do |number|
      # Block form of File.open so each page's handle is closed once counted.
      File.open(File.join(OUTPUT,"corrosion.reoriented_#{number}.txt")) do |page|
        page.each_char do |c|
          case c
          when /[A-Za-z]/
            letters += 1
          when /\s/
            # Whitespace counts toward neither total.
          else
            nonletters += 1
          end
        end
      end
    end

    # the corrosion.pdf has 6160 letters & 362 nonletters, or ~17:1
    # so lets give a fudge factor of ~half of that or 8:1
    assert letters / 8 > nonletters, "Expected that text extracted with orientation detection would have more letters."
  end

end