|
3 | 3 | require 'nokogiri' |
4 | 4 | require 'zip' |
5 | 5 |
|
module Docx
  # The Document class wraps around a docx file and provides methods to
  # interface with it.
  #
  #   # get a Docx::Document for a docx file in the local directory
  #   doc = Docx::Document.open("test.docx")
  #
  #   # get the text from the document
  #   puts doc.text
  #
  #   # do the same thing in a block
  #   Docx::Document.open("test.docx") do |d|
  #     puts d.text
  #   end
  class Document
    # NOTE(review): +xml+ is exposed for compatibility, but @xml is never
    # assigned anywhere in this class, so the reader returns nil.
    attr_reader :xml, :doc, :zip, :styles

    # Opens the docx archive at +path+ and parses 'word/document.xml' and
    # 'word/styles.xml' out of it.
    #
    # When a block is given, the document is yielded to it and the zip
    # handle is closed afterwards, even if the block raises.
    def initialize(path, &block)
      @replace = {}
      @zip = Zip::File.open(path)
      @document_xml = @zip.read('word/document.xml')
      @doc = Nokogiri::XML(@document_xml)
      @styles_xml = @zip.read('word/styles.xml')
      @styles = Nokogiri::XML(@styles_xml)
      return unless block_given?

      begin
        yield self
      ensure
        # Release the archive handle no matter how the block exits.
        @zip.close
      end
    end

    # This stores the current global document properties, for now
    def document_properties
      {
        font_size: font_size
      }
    end

    # With no associated block, Docx::Document.open is a synonym for
    # Docx::Document.new. If the optional code block is given, it is passed
    # the opened +docx+ file as an argument and the underlying zip handle is
    # closed automatically when the block terminates.
    #
    # Because this delegates to +new+, the return value is always the
    # Docx::Document instance; the block's own value is discarded.
    # call-seq:
    #   open(filepath) => document
    #   open(filepath) {|document| block } => document
    def self.open(path, &block)
      new(path, &block)
    end

    # Every paragraph (w:p) found under the document body, wrapped as
    # Elements::Containers::Paragraph.
    def paragraphs
      @doc.xpath('//w:document//w:body//w:p').map { |p_node| parse_paragraph_from p_node }
    end

    # Hash of bookmark name => Elements::Bookmark for every w:bookmarkStart
    # node. Later bookmarks with the same name overwrite earlier ones.
    def bookmarks
      @doc.xpath('//w:bookmarkStart').each_with_object({}) do |b_node, by_name|
        bookmark = parse_bookmark_from b_node
        # "_GoBack" is auto-generated by office 2010, so it is skipped
        next if bookmark.name == "_GoBack"

        by_name[bookmark.name] = bookmark
      end
    end

    # Every table (w:tbl) found under the document body, wrapped as
    # Elements::Containers::Table.
    def tables
      @doc.xpath('//w:document//w:body//w:tbl').map { |t_node| parse_table_from t_node }
    end

    # Some documents have this set, others don't (nil is returned then).
    # The stored value is in half-points, so it is divided by 2 to get points.
    # The division is integer division: an odd half-point value rounds down.
    def font_size
      size_tag = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
      size_tag ? size_tag.attributes['val'].value.to_i / 2 : nil
    end

    ##
    # *Deprecated*
    #
    # Iterates over paragraphs within document. Without a block an
    # Enumerator over the paragraphs is returned.
    # call-seq:
    #   each_paragraph => Enumerator
    #   each_paragraph {|paragraph| block }
    def each_paragraph
      return enum_for(:each_paragraph) unless block_given?

      paragraphs.each { |paragraph| yield(paragraph) }
    end

    # Plain text of the document: one line per paragraph.
    # call-seq:
    #   to_s -> string
    def to_s
      paragraphs.map(&:to_s).join("\n")
    end

    # Output entire document as a String HTML fragment, one paragraph per line
    def to_html
      # Double quotes are required: '\n' would be a literal backslash + n.
      paragraphs.map(&:to_html).join("\n")
    end

    # Save document to provided path. Every entry of the source archive is
    # copied over, except entries that have replacement content registered
    # in @replace. The source zip handle is closed afterwards.
    # call-seq:
    #   save(filepath) => void
    def save(path)
      update
      Zip::OutputStream.open(path) do |out|
        zip.each do |entry|
          out.put_next_entry(entry.name)
          out.write(@replace[entry.name] || zip.read(entry.name))
        end
      end
      zip.close
    end

    alias_method :text, :to_s

    private

    #--
    # TODO: Flesh this out to be compatible with other files
    # TODO: Method to set flag on files that have been edited, probably by inserting something at the
    # end of methods that make edits?
    #++
    # Stages the current state of the parsed document XML for the next save.
    def update
      @replace["word/document.xml"] = doc.serialize(:save_with => 0)
    end

    # generate Elements::Containers::Paragraph from paragraph XML node
    def parse_paragraph_from(p_node)
      Elements::Containers::Paragraph.new(p_node, document_properties)
    end

    # generate Elements::Bookmark from bookmark XML node
    def parse_bookmark_from(b_node)
      Elements::Bookmark.new(b_node)
    end

    # generate Elements::Containers::Table from table XML node
    def parse_table_from(t_node)
      Elements::Containers::Table.new(t_node)
    end
  end
end
module Docx
  # The Document class wraps around a docx file and provides methods to
  # interface with it.
  #
  #   # get a Docx::Document for a docx file in the local directory
  #   doc = Docx::Document.open("test.docx")
  #
  #   # get the text from the document
  #   puts doc.text
  #
  #   # do the same thing in a block
  #   Docx::Document.open("test.docx") do |d|
  #     puts d.text
  #   end
  class Document
    # NOTE(review): +xml+ is exposed for compatibility, but @xml is never
    # assigned anywhere in this class, so the reader returns nil.
    attr_reader :xml, :doc, :zip, :styles

    # Opens the docx archive at +path+ and parses 'word/document.xml' and
    # 'word/styles.xml' out of it.
    #
    # When a block is given, the document is yielded to it and the zip
    # handle is closed afterwards, even if the block raises.
    def initialize(path, &block)
      @replace = {}
      @zip = Zip::File.open(path)
      @document_xml = @zip.read('word/document.xml')
      @doc = Nokogiri::XML(@document_xml)
      @styles_xml = @zip.read('word/styles.xml')
      @styles = Nokogiri::XML(@styles_xml)
      return unless block_given?

      begin
        yield self
      ensure
        # Release the archive handle no matter how the block exits.
        @zip.close
      end
    end

    # This stores the current global document properties, for now
    def document_properties
      {
        font_size: font_size
      }
    end

    # With no associated block, Docx::Document.open is a synonym for
    # Docx::Document.new. If the optional code block is given, it is passed
    # the opened +docx+ file as an argument and the underlying zip handle is
    # closed automatically when the block terminates.
    #
    # Because this delegates to +new+, the return value is always the
    # Docx::Document instance; the block's own value is discarded.
    # call-seq:
    #   open(filepath) => document
    #   open(filepath) {|document| block } => document
    def self.open(path, &block)
      new(path, &block)
    end

    # Every paragraph (w:p) found under the document body, wrapped as
    # Elements::Containers::Paragraph.
    def paragraphs
      @doc.xpath('//w:document//w:body//w:p').map { |p_node| parse_paragraph_from p_node }
    end

    # Hash of bookmark name => Elements::Bookmark for every w:bookmarkStart
    # node. Later bookmarks with the same name overwrite earlier ones.
    def bookmarks
      @doc.xpath('//w:bookmarkStart').each_with_object({}) do |b_node, by_name|
        bookmark = parse_bookmark_from b_node
        # "_GoBack" is auto-generated by office 2010, so it is skipped
        next if bookmark.name == "_GoBack"

        by_name[bookmark.name] = bookmark
      end
    end

    # Every table (w:tbl) found under the document body, wrapped as
    # Elements::Containers::Table.
    def tables
      @doc.xpath('//w:document//w:body//w:tbl').map { |t_node| parse_table_from t_node }
    end

    # Some documents have this set, others don't (nil is returned then).
    # The stored value is in half-points, so it is divided by 2 to get points.
    # The division is integer division: an odd half-point value rounds down.
    def font_size
      size_tag = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
      size_tag ? size_tag.attributes['val'].value.to_i / 2 : nil
    end

    ##
    # *Deprecated*
    #
    # Iterates over paragraphs within document. Without a block an
    # Enumerator over the paragraphs is returned.
    # call-seq:
    #   each_paragraph => Enumerator
    #   each_paragraph {|paragraph| block }
    def each_paragraph
      return enum_for(:each_paragraph) unless block_given?

      paragraphs.each { |paragraph| yield(paragraph) }
    end

    # Plain text of the document: one line per paragraph.
    # call-seq:
    #   to_s -> string
    def to_s
      paragraphs.map(&:to_s).join("\n")
    end

    # Output entire document as a String HTML fragment, one paragraph per line
    def to_html
      # Double quotes are required: '\n' would be a literal backslash + n.
      paragraphs.map(&:to_html).join("\n")
    end

    # Save document to provided path. Every entry of the source archive is
    # copied over, except entries that have replacement content registered
    # through #replace_entry. The source zip handle is closed afterwards.
    # call-seq:
    #   save(filepath) => void
    def save(path)
      update
      Zip::OutputStream.open(path) do |out|
        zip.each do |entry|
          out.put_next_entry(entry.name)
          out.write(@replace[entry.name] || zip.read(entry.name))
        end
      end
      zip.close
    end

    alias_method :text, :to_s

    # Registers +file_contents+ to be written in place of the archive entry
    # at +entry_path+ on the next #save. A nil or false value makes #save
    # fall back to the entry's original content.
    def replace_entry(entry_path, file_contents)
      @replace[entry_path] = file_contents
    end

    private

    #--
    # TODO: Flesh this out to be compatible with other files
    # TODO: Method to set flag on files that have been edited, probably by inserting something at the
    # end of methods that make edits?
    #++
    # Stages the current state of the parsed document XML for the next save.
    def update
      replace_entry "word/document.xml", doc.serialize(:save_with => 0)
    end

    # generate Elements::Containers::Paragraph from paragraph XML node
    def parse_paragraph_from(p_node)
      Elements::Containers::Paragraph.new(p_node, document_properties)
    end

    # generate Elements::Bookmark from bookmark XML node
    def parse_bookmark_from(b_node)
      Elements::Bookmark.new(b_node)
    end

    # generate Elements::Containers::Table from table XML node
    def parse_table_from(t_node)
      Elements::Containers::Table.new(t_node)
    end
  end
end
0 commit comments