2222import java .io .InputStream ;
2323import java .io .UnsupportedEncodingException ;
2424import java .net .URLDecoder ;
25- import java .nio .file .Path ;
2625import java .util .ArrayList ;
2726import java .util .Arrays ;
2827import java .util .Collections ;
3534import java .util .Set ;
3635
3736import org .apache .commons .compress .archivers .zip .ZipArchiveEntry ;
38- import org .apache .commons .compress .archivers .zip .ZipArchiveInputStream ;
3937import org .apache .commons .compress .archivers .zip .ZipFile ;
4038import org .apache .commons .io .IOUtils ;
41- import org .apache .commons .io .input .CloseShieldInputStream ;
4239import org .apache .commons .lang3 .StringUtils ;
4340import org .xml .sax .Attributes ;
4441import org .xml .sax .ContentHandler ;
5249import org .apache .tika .exception .WriteLimitReachedException ;
5350import org .apache .tika .extractor .EmbeddedDocumentExtractor ;
5451import org .apache .tika .extractor .EmbeddedDocumentUtil ;
55- import org .apache .tika .io .FilenameUtils ;
56- import org .apache .tika .io .TemporaryResources ;
5752import org .apache .tika .io .TikaInputStream ;
5853import org .apache .tika .metadata .Metadata ;
5954import org .apache .tika .metadata .TikaCoreProperties ;
6560import org .apache .tika .sax .ContentHandlerDecorator ;
6661import org .apache .tika .sax .EmbeddedContentHandler ;
6762import org .apache .tika .sax .XHTMLContentHandler ;
68- import org .apache .tika .utils .ParserUtils ;
6963import org .apache .tika .utils .XMLReaderUtils ;
70- import org .apache .tika .zip .utils .ZipSalvager ;
7164
7265/**
7366 * Epub parser
@@ -146,54 +139,28 @@ private void updateMimeType(InputStream is, Metadata metadata) throws IOExceptio
146139 private Set <String > bufferedParse (TikaInputStream tis , ContentHandler bodyHandler ,
147140 XHTMLContentHandler xhtml , Metadata metadata , ParseContext context )
148141 throws IOException , TikaException , SAXException {
142+ // DefaultZipContainerDetector opens (and salvages, if needed) the ZipFile and
143+ // stashes it on the TikaInputStream. Reuse it when present; otherwise open ourselves.
149144 if (tis .getOpenContainer () instanceof ZipFile ) {
150- return bufferedParseZipFile ((ZipFile ) tis .getOpenContainer (), bodyHandler , xhtml , metadata , context , true );
145+ return bufferedParseZipFile ((ZipFile ) tis .getOpenContainer (), bodyHandler , xhtml , metadata , context );
151146 }
152- ZipFile zipFile = null ;
153- try {
154- zipFile = ZipFile .builder ().setFile (tis .getPath ().toFile ()).get ();
155- } catch (IOException e ) {
156- ParserUtils .recordParserFailure (this , e , metadata );
157- return trySalvage (tis .getPath (), bodyHandler , xhtml , metadata , context );
158- }
159-
160- try {
161- return bufferedParseZipFile (zipFile , bodyHandler , xhtml , metadata , context , true );
162- } finally {
163- zipFile .close ();
164- }
165- }
166-
167- private Set <String > trySalvage (Path brokenZip , ContentHandler bodyHandler ,
168- XHTMLContentHandler xhtml ,
169- Metadata metadata , ParseContext context )
170- throws IOException , TikaException , SAXException {
171- try (TemporaryResources resources = new TemporaryResources ()) {
172- Path salvaged =
173- resources .createTempFile (FilenameUtils .getSuffixFromPath (brokenZip .getFileName ().toString ()));
174- ZipSalvager .salvageCopy (brokenZip , salvaged );
175- try (ZipFile zipFile = ZipFile .builder ().setFile (salvaged .toFile ()).get ()) {
176- return bufferedParseZipFile (zipFile , bodyHandler , xhtml , metadata , context , false );
177- } catch (EpubZipException e ) {
178- try (TikaInputStream tis = TikaInputStream .get (salvaged )) {
179- return streamingParse (tis , xhtml , metadata , context );
180- }
181- }
147+ try (ZipFile zipFile = ZipFile .builder ().setFile (tis .getPath ().toFile ()).get ()) {
148+ return bufferedParseZipFile (zipFile , bodyHandler , xhtml , metadata , context );
182149 }
183150 }
184151
185152 private Set <String > bufferedParseZipFile (ZipFile zipFile , ContentHandler bodyHandler ,
186153 XHTMLContentHandler xhtml , Metadata metadata ,
187- ParseContext context , boolean isStrict )
188- throws IOException , TikaException , SAXException , EpubZipException {
154+ ParseContext context )
155+ throws IOException , TikaException , SAXException {
189156
190157 String rootOPF = getRoot (zipFile , context );
191158 if (rootOPF == null ) {
192- throw new EpubZipException () ;
159+ return Collections . EMPTY_SET ;
193160 }
194161 ZipArchiveEntry zae = zipFile .getEntry (rootOPF );
195162 if (zae == null || !zipFile .canReadEntryData (zae )) {
196- throw new EpubZipException () ;
163+ return Collections . EMPTY_SET ;
197164 }
198165 try (TikaInputStream tis = TikaInputStream .get (zipFile .getInputStream (zae ))) {
199166 opf .parse (tis , new DefaultHandler (), metadata , context );
@@ -203,33 +170,14 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
203170 try (InputStream is = zipFile .getInputStream (zae )) {
204171 XMLReaderUtils .parseSAX (is , contentOrderScraper , context );
205172 }
206- //if no content items, false
207173 if (contentOrderScraper .contentItems .isEmpty ()) {
208- throw new EpubZipException () ;
174+ return Collections . EMPTY_SET ;
209175 }
210176 String relativePath = "" ;
211177 if (rootOPF .lastIndexOf ("/" ) > -1 ) {
212178 relativePath = rootOPF .substring (0 , rootOPF .lastIndexOf ("/" ) + 1 );
213179 }
214180
215- if (isStrict ) {
216- int found = 0 ;
217- for (String id : contentOrderScraper .contentItems ) {
218- HRefMediaPair hRefMediaPair = contentOrderScraper .locationMap .get (id );
219- if (hRefMediaPair != null && hRefMediaPair .href != null ) {
220- zae = zipFile .getEntry (relativePath + hRefMediaPair .href );
221- if (zae != null && zipFile .canReadEntryData (zae )) {
222- found ++;
223- }
224- }
225- }
226- //if not perfect match btwn items and readable items
227- //return false
228- if (found != contentOrderScraper .contentItems .size ()) {
229- throw new EpubZipException ();
230- }
231- }
232-
233181 extractMetadata (zipFile , metadata , context );
234182 Set <String > encryptedItems = checkForDRM (zipFile );
235183 Set <String > processed = new HashSet <>();
@@ -306,12 +254,6 @@ private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaExcepti
306254 }
307255 }
308256
309- private void checkForDRM (InputStream is , ParseContext parseContext )
310- throws IOException , TikaException , SAXException {
311- Set <String > encryptedItems = EncryptionHandler .parse (is , parseContext );
312- maybeThrowEncryptedException (encryptedItems );
313- }
314-
315257 private void maybeThrowEncryptedException (Set <String > encryptedItems )
316258 throws EncryptedDocumentException {
317259 if (encryptedItems .size () == 0 ) {
@@ -437,55 +379,6 @@ private String getRoot(ZipFile zipFile, ParseContext context)
437379 }
438380 }
439381
440- //should only be used as a last resort on a truncated zip
441- private Set <String > streamingParse (InputStream stream , ContentHandler bodyHandler ,
442- Metadata metadata ,
443- ParseContext context )
444- throws IOException , TikaException , SAXException {
445- ZipArchiveInputStream zip = new ZipArchiveInputStream (stream , "UTF-8" , false , true , false );
446-
447- ZipArchiveEntry entry = zip .getNextEntry ();
448- SAXException sax = null ;
449- while (entry != null ) {
450- if (entry .getName ().equals ("mimetype" )) {
451- updateMimeType (zip , metadata );
452- } else if (entry .getName ().equals (META_INF_ENCRYPTION )) {
453- //when streaming, throw an encryption exception if anything is encrypted
454- checkForDRM (zip , context );
455- } else if (entry .getName ().equals ("metadata.xml" )) {
456- try (TikaInputStream tisZip = TikaInputStream .get (CloseShieldInputStream .wrap (zip ))) {
457- meta .parse (tisZip , new DefaultHandler (), metadata , context );
458- }
459- } else if (entry .getName ().endsWith (".opf" )) {
460- try (TikaInputStream tisZip = TikaInputStream .get (CloseShieldInputStream .wrap (zip ))) {
461- opf .parse (tisZip , new DefaultHandler (), metadata , context );
462- }
463- } else if (entry .getName ().endsWith (".htm" ) || entry .getName ().endsWith (".html" ) ||
464- entry .getName ().endsWith (".xhtml" ) || entry .getName ().endsWith (".xml" )) {
465- try {
466- try (TikaInputStream tisZip = TikaInputStream .get (CloseShieldInputStream .wrap (zip ))) {
467- content .parse (tisZip , bodyHandler , metadata , context );
468- }
469- } catch (SAXException e ) {
470- if (WriteLimitReachedException .isWriteLimitReached (e )) {
471- throw e ;
472- }
473- if (sax == null ) {
474- sax = e ;
475- }
476- }
477- }
478- entry = zip .getNextEntry ();
479- }
480- if (sax != null ) {
481- throw sax ;
482- }
483- //always empty -- we throw an encryption exception
484- //as soon as checkForDRM hits an encrypted item
485- return Collections .EMPTY_SET ;
486- }
487-
488-
489382 private static class RootFinder extends DefaultHandler {
490383 String root = null ;
491384
@@ -586,12 +479,6 @@ public Set<String> getEncryptedItems() {
586479 }
587480 }
588481
589- //any problem with parsing an epub file when it is
590- //a zip file
591- private static class EpubZipException extends IOException {
592-
593- }
594-
595482 //for now, this simply converts all names to local names to avoid
596483 //namespace conflicts in the content handler. This also removes namespaces
597484 //from attributes
0 commit comments