This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 619077d4e4 4x-reg-sax-fixes (#2773)
619077d4e4 is described below
commit 619077d4e40b70d9634d0213dc2ddd92cc61e3a4
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 17 14:00:44 2026 -0400
4x-reg-sax-fixes (#2773)
---
.../detect/microsoft/ooxml/OPCPackageDetector.java | 20 +++++++-
.../ooxml/SXWPFWordExtractorDecorator.java | 5 ++
.../microsoft/ooxml/TikaSheetXMLHandler.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 60 ++++++++++++++++++----
.../java/org/apache/tika/parser/pkg/ZipParser.java | 19 +++++--
5 files changed, 90 insertions(+), 16 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
index 52108e8885..8378d9b19c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
@@ -281,7 +281,25 @@ public class OPCPackageDetector implements
ZipContainerDetector {
//no need to close zipEntrySource because it
//only closes the underlying zipFile, not any other resources
//as of this writing.... :'(
- return null;
+ //fall through to [Content_Types].xml fallback below
+ }
+ // POI may have failed (caught above) OR returned null because the
+ // rels were malformed and POI silently produced an empty relationship
+ // collection. Either way, fall back to parsing [Content_Types].xml
+ // directly — same approach as the streaming detector.
+ if (type == null) {
+ ZipArchiveEntry ctEntry =
zipEntrySource.getEntry("[Content_Types].xml");
+ if (ctEntry != null) {
+ try (InputStream contentTypesStream =
+ zipEntrySource.getInputStream(ctEntry)) {
+ type = parseOOXMLContentTypes(contentTypesStream);
+ } catch (IOException ignore) {
+ //swallow
+ }
+ }
+ if (type == null || pkg == null) {
+ return type;
+ }
}
//this will now be closed eventually when the wrapper closes
//the pkg which will close this
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d990e6f96a..de3ff51835 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -38,6 +38,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -376,6 +377,10 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
linkedRelationships,
config.isIncludeShapeBasedContent(),
config.isConcatenatePhoneticRuns(),
config.isPreferAlternateContentChoice())),
context);
+ } catch (SAXException e) {
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
} catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
index 3ba83dd255..ec96a40c2e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
@@ -299,7 +299,7 @@ class TikaSheetXMLHandler extends DefaultHandler {
break;
case SST_STRING:
String sstIndex = value.toString().trim();
- if (!sstIndex.isEmpty()) {
+ if (!sstIndex.isEmpty() && sharedStringsShim != null) {
try {
int idx = Integer.parseInt(sstIndex);
thisStr = sharedStringsShim.getItemAt(idx);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 10f4c1daf5..d968cdb856 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -51,6 +51,7 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -58,6 +59,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
@@ -142,24 +144,56 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
throws SAXException, IOException {
OPCPackage container = opcPackage;
- XSSFSharedStringsShim stringsShim;
+ XSSFSharedStringsShim stringsShim = null;
XSSFReader.SheetIterator iter;
XSSFReader xssfReader;
- XSSFStylesShim stylesShim;
+ XSSFStylesShim stylesShim = null;
try {
xssfReader = new XSSFReader(container);
- stylesShim = new XSSFStylesShim(xssfReader.getStylesData(),
parseContext);
-
iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+ } catch (OpenXML4JException | RuntimeException e) {
+ throw new IOException(e);
+ }
+ // Styles and shared strings are optional — if either part is missing or
+ // unreadable, log to metadata and continue with degraded extraction.
+ try {
+ stylesShim = new XSSFStylesShim(xssfReader.getStylesData(),
parseContext);
+ } catch (Exception e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ }
+ try {
stringsShim = new
XSSFSharedStringsShim(xssfReader.getSharedStringsData(),
config.isConcatenatePhoneticRuns(), parseContext);
- } catch (OpenXML4JException | TikaException e) {
- throw new IOException(e);
+ } catch (Exception e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
}
- while (iter.hasNext()) {
+ while (true) {
+ try {
+ if (!iter.hasNext()) {
+ break;
+ }
+ } catch (RuntimeException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ break;
+ }
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config,
xhtml);
PackagePart sheetPart = null;
- try (InputStream stream = iter.next()) {
+ InputStream nextStream;
+ try {
+ nextStream = iter.next();
+ } catch (RuntimeException e) {
+ // POI can throw POIXMLException for missing sheet parts (e.g.,
+ // truncated workbook references a sheet that isn't in the zip).
+ // Break rather than continue — POI's iterator state may not have
+ // advanced, which would cause an infinite loop.
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ break;
+ }
+ try (InputStream stream = nextStream) {
sheetPart = iter.getSheetPart();
addDrawingHyperLinks(sheetPart);
@@ -178,7 +212,15 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.startElement("table");
xhtml.startElement("tbody");
- processSheet(sheetExtractor, commentsShim, stylesShim,
stringsShim, stream);
+ try {
+ processSheet(sheetExtractor, commentsShim, stylesShim,
stringsShim, stream);
+ } catch (SAXException e) {
+ // Truncated/malformed sheet XML — keep prior sheets and
+ // record the failure as a warning.
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ }
try {
getThreadedComments(container, sheetPart, xhtml);
} catch (InvalidFormatException | TikaException | IOException
e) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 698bd26991..eec152cb7e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -59,6 +59,7 @@ import org.apache.tika.metadata.Zip;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ParserUtils;
/**
* Parser for ZIP and JAR archives using file-based access for complete metadata extraction.
@@ -374,12 +375,22 @@ public class ZipParser extends AbstractArchiveParser {
throws TikaException, IOException, SAXException {
try {
- ArchiveEntry entry = zis.getNextEntry();
- while (entry != null) {
+ ArchiveEntry entry;
+ while (true) {
+ try {
+ entry = zis.getNextEntry();
+ } catch (java.util.zip.ZipException ze) {
+ // Truncated/corrupt central directory: stop iteration but keep
+ // entries already extracted. Record the failure as a warning.
+ ParserUtils.recordParserFailure(this, ze, metadata);
+ break;
+ }
+ if (entry == null) {
+ break;
+ }
if (shouldUseDataDescriptor && entryCnt.get() > 0) {
// Skip already-processed entries on re-read
entryCnt.decrementAndGet();
- entry = zis.getNextEntry();
continue;
}
@@ -405,8 +416,6 @@ public class ZipParser extends AbstractArchiveParser {
if (!shouldUseDataDescriptor) {
entryCnt.incrementAndGet();
}
-
- entry = zis.getNextEntry();
}
} catch (UnsupportedZipFeatureException zfe) {
if (zfe.getFeature() == Feature.ENCRYPTION) {