Skip to content

Commit 5bd8fbb

Browse files
authored
update epub along the lines of oodt (#2774)
1 parent 48e4ecc commit 5bd8fbb

2 files changed

Lines changed: 14 additions & 126 deletions

File tree

  • tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java

Lines changed: 10 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.io.InputStream;
2323
import java.io.UnsupportedEncodingException;
2424
import java.net.URLDecoder;
25-
import java.nio.file.Path;
2625
import java.util.ArrayList;
2726
import java.util.Arrays;
2827
import java.util.Collections;
@@ -35,10 +34,8 @@
3534
import java.util.Set;
3635

3736
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
38-
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
3937
import org.apache.commons.compress.archivers.zip.ZipFile;
4038
import org.apache.commons.io.IOUtils;
41-
import org.apache.commons.io.input.CloseShieldInputStream;
4239
import org.apache.commons.lang3.StringUtils;
4340
import org.xml.sax.Attributes;
4441
import org.xml.sax.ContentHandler;
@@ -52,8 +49,6 @@
5249
import org.apache.tika.exception.WriteLimitReachedException;
5350
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
5451
import org.apache.tika.extractor.EmbeddedDocumentUtil;
55-
import org.apache.tika.io.FilenameUtils;
56-
import org.apache.tika.io.TemporaryResources;
5752
import org.apache.tika.io.TikaInputStream;
5853
import org.apache.tika.metadata.Metadata;
5954
import org.apache.tika.metadata.TikaCoreProperties;
@@ -65,9 +60,7 @@
6560
import org.apache.tika.sax.ContentHandlerDecorator;
6661
import org.apache.tika.sax.EmbeddedContentHandler;
6762
import org.apache.tika.sax.XHTMLContentHandler;
68-
import org.apache.tika.utils.ParserUtils;
6963
import org.apache.tika.utils.XMLReaderUtils;
70-
import org.apache.tika.zip.utils.ZipSalvager;
7164

7265
/**
7366
* Epub parser
@@ -146,54 +139,28 @@ private void updateMimeType(InputStream is, Metadata metadata) throws IOExceptio
146139
private Set<String> bufferedParse(TikaInputStream tis, ContentHandler bodyHandler,
147140
XHTMLContentHandler xhtml, Metadata metadata, ParseContext context)
148141
throws IOException, TikaException, SAXException {
142+
// DefaultZipContainerDetector opens (and salvages, if needed) the ZipFile and
143+
// stashes it on the TikaInputStream. Reuse it when present; otherwise open the ZipFile ourselves.
149144
if (tis.getOpenContainer() instanceof ZipFile) {
150-
return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, context, true);
145+
return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, context);
151146
}
152-
ZipFile zipFile = null;
153-
try {
154-
zipFile = ZipFile.builder().setFile(tis.getPath().toFile()).get();
155-
} catch (IOException e) {
156-
ParserUtils.recordParserFailure(this, e, metadata);
157-
return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
158-
}
159-
160-
try {
161-
return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
162-
} finally {
163-
zipFile.close();
164-
}
165-
}
166-
167-
private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler,
168-
XHTMLContentHandler xhtml,
169-
Metadata metadata, ParseContext context)
170-
throws IOException, TikaException, SAXException {
171-
try (TemporaryResources resources = new TemporaryResources()) {
172-
Path salvaged =
173-
resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString()));
174-
ZipSalvager.salvageCopy(brokenZip, salvaged);
175-
try (ZipFile zipFile = ZipFile.builder().setFile(salvaged.toFile()).get()) {
176-
return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
177-
} catch (EpubZipException e) {
178-
try (TikaInputStream tis = TikaInputStream.get(salvaged)) {
179-
return streamingParse(tis, xhtml, metadata, context);
180-
}
181-
}
147+
try (ZipFile zipFile = ZipFile.builder().setFile(tis.getPath().toFile()).get()) {
148+
return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context);
182149
}
183150
}
184151

185152
private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler,
186153
XHTMLContentHandler xhtml, Metadata metadata,
187-
ParseContext context, boolean isStrict)
188-
throws IOException, TikaException, SAXException, EpubZipException {
154+
ParseContext context)
155+
throws IOException, TikaException, SAXException {
189156

190157
String rootOPF = getRoot(zipFile, context);
191158
if (rootOPF == null) {
192-
throw new EpubZipException();
159+
return Collections.EMPTY_SET;
193160
}
194161
ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
195162
if (zae == null || !zipFile.canReadEntryData(zae)) {
196-
throw new EpubZipException();
163+
return Collections.EMPTY_SET;
197164
}
198165
try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) {
199166
opf.parse(tis, new DefaultHandler(), metadata, context);
@@ -203,33 +170,14 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
203170
try (InputStream is = zipFile.getInputStream(zae)) {
204171
XMLReaderUtils.parseSAX(is, contentOrderScraper, context);
205172
}
206-
//if no content items, false
207173
if (contentOrderScraper.contentItems.isEmpty()) {
208-
throw new EpubZipException();
174+
return Collections.EMPTY_SET;
209175
}
210176
String relativePath = "";
211177
if (rootOPF.lastIndexOf("/") > -1) {
212178
relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1);
213179
}
214180

215-
if (isStrict) {
216-
int found = 0;
217-
for (String id : contentOrderScraper.contentItems) {
218-
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
219-
if (hRefMediaPair != null && hRefMediaPair.href != null) {
220-
zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
221-
if (zae != null && zipFile.canReadEntryData(zae)) {
222-
found++;
223-
}
224-
}
225-
}
226-
//if not perfect match btwn items and readable items
227-
//return false
228-
if (found != contentOrderScraper.contentItems.size()) {
229-
throw new EpubZipException();
230-
}
231-
}
232-
233181
extractMetadata(zipFile, metadata, context);
234182
Set<String> encryptedItems = checkForDRM(zipFile);
235183
Set<String> processed = new HashSet<>();
@@ -306,12 +254,6 @@ private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaExcepti
306254
}
307255
}
308256

309-
private void checkForDRM(InputStream is, ParseContext parseContext)
310-
throws IOException, TikaException, SAXException {
311-
Set<String> encryptedItems = EncryptionHandler.parse(is, parseContext);
312-
maybeThrowEncryptedException(encryptedItems);
313-
}
314-
315257
private void maybeThrowEncryptedException(Set<String> encryptedItems)
316258
throws EncryptedDocumentException {
317259
if (encryptedItems.size() == 0) {
@@ -437,55 +379,6 @@ private String getRoot(ZipFile zipFile, ParseContext context)
437379
}
438380
}
439381

440-
//should only be used as a last resort on a truncated zip
441-
private Set<String> streamingParse(InputStream stream, ContentHandler bodyHandler,
442-
Metadata metadata,
443-
ParseContext context)
444-
throws IOException, TikaException, SAXException {
445-
ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);
446-
447-
ZipArchiveEntry entry = zip.getNextEntry();
448-
SAXException sax = null;
449-
while (entry != null) {
450-
if (entry.getName().equals("mimetype")) {
451-
updateMimeType(zip, metadata);
452-
} else if (entry.getName().equals(META_INF_ENCRYPTION)) {
453-
//when streaming, throw an encryption exception if anything is encrypted
454-
checkForDRM(zip, context);
455-
} else if (entry.getName().equals("metadata.xml")) {
456-
try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
457-
meta.parse(tisZip, new DefaultHandler(), metadata, context);
458-
}
459-
} else if (entry.getName().endsWith(".opf")) {
460-
try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
461-
opf.parse(tisZip, new DefaultHandler(), metadata, context);
462-
}
463-
} else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") ||
464-
entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) {
465-
try {
466-
try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) {
467-
content.parse(tisZip, bodyHandler, metadata, context);
468-
}
469-
} catch (SAXException e) {
470-
if (WriteLimitReachedException.isWriteLimitReached(e)) {
471-
throw e;
472-
}
473-
if (sax == null) {
474-
sax = e;
475-
}
476-
}
477-
}
478-
entry = zip.getNextEntry();
479-
}
480-
if (sax != null) {
481-
throw sax;
482-
}
483-
//always empty -- we throw an encryption exception
484-
//as soon as checkForDRM hits an encrypted item
485-
return Collections.EMPTY_SET;
486-
}
487-
488-
489382
private static class RootFinder extends DefaultHandler {
490383
String root = null;
491384

@@ -586,12 +479,6 @@ public Set<String> getEncryptedItems() {
586479
}
587480
}
588481

589-
//any problem with parsing an epub file when it is
590-
//a zip file
591-
private static class EpubZipException extends IOException {
592-
593-
}
594-
595482
//for now, this simply converts all names to local names to avoid
596483
//namespace conflicts in the content handler. This also removes namespaces
597484
//from attributes

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import org.apache.tika.metadata.Metadata;
3434
import org.apache.tika.metadata.Property;
3535
import org.apache.tika.metadata.TikaCoreProperties;
36-
import org.apache.tika.parser.Parser;
3736

3837
public class EpubParserTest extends TikaTest {
3938

@@ -88,10 +87,12 @@ public void testEpubOrder() throws Exception {
8887

8988
@Test
9089
public void testTruncated() throws Exception {
91-
Parser p = new EpubParser();
90+
// Truncated zips are salvaged by DefaultZipContainerDetector and the
91+
// recovered ZipFile is handed to EpubParser via openContainer.
92+
// EpubParser itself no longer salvages — it relies on the detector.
9293
List<Metadata> metadataList;
9394
try (TikaInputStream tis = truncate("testEPUB.epub", 10000)) {
94-
metadataList = getRecursiveMetadata(tis, p, true);
95+
metadataList = getRecursiveMetadata(tis, true);
9596
}
9697
String xml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
9798
int ch1 = xml.indexOf("<h1>Chapter 1");

0 commit comments

Comments
 (0)