Skip to content

Commit 619077d

Browse files
authored
4x-reg-sax-fixes (#2773)
1 parent 5bd8fbb commit 619077d

5 files changed

Lines changed: 90 additions & 16 deletions

File tree

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,25 @@ public MediaType detect(ZipFile zipFile, TikaInputStream stream) throws IOExcept
281281
//no need to close zipEntrySource because it
282282
//only closes the underlying zipFile, not any other resources
283283
//as of this writing.... :'(
284-
return null;
284+
//fall through to [Content_Types].xml fallback below
285+
}
286+
// POI may have failed (caught above) OR returned null because the
287+
// rels were malformed and POI silently produced an empty relationship
288+
// collection. Either way, fall back to parsing [Content_Types].xml
289+
// directly — same approach as the streaming detector.
290+
if (type == null) {
291+
ZipArchiveEntry ctEntry = zipEntrySource.getEntry("[Content_Types].xml");
292+
if (ctEntry != null) {
293+
try (InputStream contentTypesStream =
294+
zipEntrySource.getInputStream(ctEntry)) {
295+
type = parseOOXMLContentTypes(contentTypesStream);
296+
} catch (IOException ignore) {
297+
//swallow: this fallback is best-effort; type stays null and is handled by the check below
298+
}
299+
}
300+
if (type == null || pkg == null) {
301+
return type;
302+
}
285303
}
286304
//this will now be closed eventually when the wrapper closes
287305
//the pkg which will close this

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.xml.sax.helpers.DefaultHandler;
3939

4040
import org.apache.tika.exception.TikaException;
41+
import org.apache.tika.exception.WriteLimitReachedException;
4142
import org.apache.tika.io.TikaInputStream;
4243
import org.apache.tika.metadata.Metadata;
4344
import org.apache.tika.metadata.Office;
@@ -376,6 +377,10 @@ private OOXMLTikaBodyPartHandler handlePart(PackagePart packagePart,
376377
linkedRelationships, config.isIncludeShapeBasedContent(),
377378
config.isConcatenatePhoneticRuns(),
378379
config.isPreferAlternateContentChoice())), context);
380+
} catch (SAXException e) {
381+
WriteLimitReachedException.throwIfWriteLimitReached(e);
382+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
383+
ExceptionUtils.getStackTrace(e));
379384
} catch (TikaException | IOException e) {
380385
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
381386
ExceptionUtils.getStackTrace(e));

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ private void outputCell() {
299299
break;
300300
case SST_STRING:
301301
String sstIndex = value.toString().trim();
302-
if (!sstIndex.isEmpty()) {
302+
if (!sstIndex.isEmpty() && sharedStringsShim != null) {
303303
try {
304304
int idx = Integer.parseInt(sstIndex);
305305
thisStr = sharedStringsShim.getItemAt(idx);

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,15 @@
5151

5252
import org.apache.tika.exception.RuntimeSAXException;
5353
import org.apache.tika.exception.TikaException;
54+
import org.apache.tika.exception.WriteLimitReachedException;
5455
import org.apache.tika.metadata.Metadata;
5556
import org.apache.tika.metadata.Office;
5657
import org.apache.tika.metadata.TikaCoreProperties;
5758
import org.apache.tika.parser.ParseContext;
5859
import org.apache.tika.parser.microsoft.OfficeParserConfig;
5960
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
6061
import org.apache.tika.sax.XHTMLContentHandler;
62+
import org.apache.tika.utils.ExceptionUtils;
6163
import org.apache.tika.utils.StringUtils;
6264
import org.apache.tika.utils.XMLReaderUtils;
6365

@@ -142,24 +144,56 @@ protected void buildXHTML(XHTMLContentHandler xhtml)
142144
throws SAXException, IOException {
143145
OPCPackage container = opcPackage;
144146

145-
XSSFSharedStringsShim stringsShim;
147+
XSSFSharedStringsShim stringsShim = null;
146148
XSSFReader.SheetIterator iter;
147149
XSSFReader xssfReader;
148-
XSSFStylesShim stylesShim;
150+
XSSFStylesShim stylesShim = null;
149151
try {
150152
xssfReader = new XSSFReader(container);
151-
stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext);
152-
153153
iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
154+
} catch (OpenXML4JException | RuntimeException e) {
155+
throw new IOException(e);
156+
}
157+
// Styles and shared strings are optional — if either part is missing or
158+
// unreadable, log to metadata and continue with degraded extraction.
159+
try {
160+
stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext);
161+
} catch (Exception e) {
162+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
163+
ExceptionUtils.getStackTrace(e));
164+
}
165+
try {
154166
stringsShim = new XSSFSharedStringsShim(xssfReader.getSharedStringsData(),
155167
config.isConcatenatePhoneticRuns(), parseContext);
156-
} catch (OpenXML4JException | TikaException e) {
157-
throw new IOException(e);
168+
} catch (Exception e) {
169+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
170+
ExceptionUtils.getStackTrace(e));
158171
}
159-
while (iter.hasNext()) {
172+
while (true) {
173+
try {
174+
if (!iter.hasNext()) {
175+
break;
176+
}
177+
} catch (RuntimeException e) {
178+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
179+
ExceptionUtils.getStackTrace(e));
180+
break;
181+
}
160182
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
161183
PackagePart sheetPart = null;
162-
try (InputStream stream = iter.next()) {
184+
InputStream nextStream;
185+
try {
186+
nextStream = iter.next();
187+
} catch (RuntimeException e) {
188+
// POI can throw POIXMLException for missing sheet parts (e.g.,
189+
// truncated workbook references a sheet that isn't in the zip).
190+
// Break rather than continue — POI's iterator state may not have
191+
// advanced, which would cause an infinite loop.
192+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
193+
ExceptionUtils.getStackTrace(e));
194+
break;
195+
}
196+
try (InputStream stream = nextStream) {
163197
sheetPart = iter.getSheetPart();
164198

165199
addDrawingHyperLinks(sheetPart);
@@ -178,7 +212,15 @@ protected void buildXHTML(XHTMLContentHandler xhtml)
178212
xhtml.startElement("table");
179213
xhtml.startElement("tbody");
180214

181-
processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream);
215+
try {
216+
processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream);
217+
} catch (SAXException e) {
218+
// Truncated/malformed sheet XML — keep prior sheets and
219+
// record the failure as a warning.
220+
WriteLimitReachedException.throwIfWriteLimitReached(e);
221+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
222+
ExceptionUtils.getStackTrace(e));
223+
}
182224
try {
183225
getThreadedComments(container, sheetPart, xhtml);
184226
} catch (InvalidFormatException | TikaException | IOException e) {

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
import org.apache.tika.mime.MediaType;
6060
import org.apache.tika.parser.ParseContext;
6161
import org.apache.tika.sax.XHTMLContentHandler;
62+
import org.apache.tika.utils.ParserUtils;
6263

6364
/**
6465
* Parser for ZIP and JAR archives using file-based access for complete metadata extraction.
@@ -374,12 +375,22 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata,
374375
throws TikaException, IOException, SAXException {
375376

376377
try {
377-
ArchiveEntry entry = zis.getNextEntry();
378-
while (entry != null) {
378+
ArchiveEntry entry;
379+
while (true) {
380+
try {
381+
entry = zis.getNextEntry();
382+
} catch (java.util.zip.ZipException ze) {
383+
// Truncated/corrupt central directory: stop iteration but keep
384+
// entries already extracted. Record the failure as a warning.
385+
ParserUtils.recordParserFailure(this, ze, metadata);
386+
break;
387+
}
388+
if (entry == null) {
389+
break;
390+
}
379391
if (shouldUseDataDescriptor && entryCnt.get() > 0) {
380392
// Skip already-processed entries on re-read
381393
entryCnt.decrementAndGet();
382-
entry = zis.getNextEntry();
383394
continue;
384395
}
385396

@@ -405,8 +416,6 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata,
405416
if (!shouldUseDataDescriptor) {
406417
entryCnt.incrementAndGet();
407418
}
408-
409-
entry = zis.getNextEntry();
410419
}
411420
} catch (UnsupportedZipFeatureException zfe) {
412421
if (zfe.getFeature() == Feature.ENCRYPTION) {

0 commit comments

Comments
 (0)