This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 619077d4e4 4x-reg-sax-fixes (#2773)
619077d4e4 is described below

commit 619077d4e40b70d9634d0213dc2ddd92cc61e3a4
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 17 14:00:44 2026 -0400

    4x-reg-sax-fixes (#2773)
---
 .../detect/microsoft/ooxml/OPCPackageDetector.java | 20 +++++++-
 .../ooxml/SXWPFWordExtractorDecorator.java         |  5 ++
 .../microsoft/ooxml/TikaSheetXMLHandler.java       |  2 +-
 .../ooxml/XSSFExcelExtractorDecorator.java         | 60 ++++++++++++++++++----
 .../java/org/apache/tika/parser/pkg/ZipParser.java | 19 +++++--
 5 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
index 52108e8885..8378d9b19c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
@@ -281,7 +281,25 @@ public class OPCPackageDetector implements ZipContainerDetector {
             //no need to close zipEntrySource because it
             //only closes the underlying zipFile, not any other resources
             //as of this writing.... :'(
-            return null;
+            //fall through to [Content_Types].xml fallback below
+        }
+        // POI may have failed (caught above) OR returned null because the
+        // rels were malformed and POI silently produced an empty relationship
+        // collection. Either way, fall back to parsing [Content_Types].xml
+        // directly — same approach as the streaming detector.
+        if (type == null) {
+            ZipArchiveEntry ctEntry = 
zipEntrySource.getEntry("[Content_Types].xml");
+            if (ctEntry != null) {
+                try (InputStream contentTypesStream =
+                             zipEntrySource.getInputStream(ctEntry)) {
+                    type = parseOOXMLContentTypes(contentTypesStream);
+                } catch (IOException ignore) {
+                    //swallow
+                }
+            }
+            if (type == null || pkg == null) {
+                return type;
+            }
         }
         //this will now be closed eventually when the wrapper closes
         //the pkg which will close this
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d990e6f96a..de3ff51835 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -38,6 +38,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -376,6 +377,10 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                             linkedRelationships, 
config.isIncludeShapeBasedContent(),
                             config.isConcatenatePhoneticRuns(),
                             config.isPreferAlternateContentChoice())), 
context);
+        } catch (SAXException e) {
+            WriteLimitReachedException.throwIfWriteLimitReached(e);
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                    ExceptionUtils.getStackTrace(e));
         } catch (TikaException | IOException e) {
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                     ExceptionUtils.getStackTrace(e));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
index 3ba83dd255..ec96a40c2e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
@@ -299,7 +299,7 @@ class TikaSheetXMLHandler extends DefaultHandler {
                     break;
                 case SST_STRING:
                     String sstIndex = value.toString().trim();
-                    if (!sstIndex.isEmpty()) {
+                    if (!sstIndex.isEmpty() && sharedStringsShim != null) {
                         try {
                             int idx = Integer.parseInt(sstIndex);
                             thisStr = sharedStringsShim.getItemAt(idx);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 10f4c1daf5..d968cdb856 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -51,6 +51,7 @@ import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -58,6 +59,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
 import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 
@@ -142,24 +144,56 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             throws SAXException, IOException {
         OPCPackage container = opcPackage;
 
-        XSSFSharedStringsShim stringsShim;
+        XSSFSharedStringsShim stringsShim = null;
         XSSFReader.SheetIterator iter;
         XSSFReader xssfReader;
-        XSSFStylesShim stylesShim;
+        XSSFStylesShim stylesShim = null;
         try {
             xssfReader = new XSSFReader(container);
-            stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), 
parseContext);
-
             iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+        } catch (OpenXML4JException | RuntimeException e) {
+            throw new IOException(e);
+        }
+        // Styles and shared strings are optional — if either part is missing 
or
+        // unreadable, log to metadata and continue with degraded extraction.
+        try {
+            stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), 
parseContext);
+        } catch (Exception e) {
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                    ExceptionUtils.getStackTrace(e));
+        }
+        try {
             stringsShim = new 
XSSFSharedStringsShim(xssfReader.getSharedStringsData(),
                     config.isConcatenatePhoneticRuns(), parseContext);
-        } catch (OpenXML4JException | TikaException e) {
-            throw new IOException(e);
+        } catch (Exception e) {
+            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                    ExceptionUtils.getStackTrace(e));
         }
-        while (iter.hasNext()) {
+        while (true) {
+            try {
+                if (!iter.hasNext()) {
+                    break;
+                }
+            } catch (RuntimeException e) {
+                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                        ExceptionUtils.getStackTrace(e));
+                break;
+            }
             SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, 
xhtml);
             PackagePart sheetPart = null;
-            try (InputStream stream = iter.next()) {
+            InputStream nextStream;
+            try {
+                nextStream = iter.next();
+            } catch (RuntimeException e) {
+                // POI can throw POIXMLException for missing sheet parts (e.g.,
+                // truncated workbook references a sheet that isn't in the 
zip).
+                // Break rather than continue — POI's iterator state may not 
have
+                // advanced, which would cause an infinite loop.
+                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                        ExceptionUtils.getStackTrace(e));
+                break;
+            }
+            try (InputStream stream = nextStream) {
                 sheetPart = iter.getSheetPart();
 
                 addDrawingHyperLinks(sheetPart);
@@ -178,7 +212,15 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 xhtml.startElement("table");
                 xhtml.startElement("tbody");
 
-                processSheet(sheetExtractor, commentsShim, stylesShim, 
stringsShim, stream);
+                try {
+                    processSheet(sheetExtractor, commentsShim, stylesShim, 
stringsShim, stream);
+                } catch (SAXException e) {
+                    // Truncated/malformed sheet XML — keep prior sheets and
+                    // record the failure as a warning.
+                    WriteLimitReachedException.throwIfWriteLimitReached(e);
+                    
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                            ExceptionUtils.getStackTrace(e));
+                }
                 try {
                     getThreadedComments(container, sheetPart, xhtml);
                 } catch (InvalidFormatException | TikaException | IOException 
e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 698bd26991..eec152cb7e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -59,6 +59,7 @@ import org.apache.tika.metadata.Zip;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ParserUtils;
 
 /**
  * Parser for ZIP and JAR archives using file-based access for complete 
metadata extraction.
@@ -374,12 +375,22 @@ public class ZipParser extends AbstractArchiveParser {
             throws TikaException, IOException, SAXException {
 
         try {
-            ArchiveEntry entry = zis.getNextEntry();
-            while (entry != null) {
+            ArchiveEntry entry;
+            while (true) {
+                try {
+                    entry = zis.getNextEntry();
+                } catch (java.util.zip.ZipException ze) {
+                    // Truncated/corrupt central directory: stop iteration but 
keep
+                    // entries already extracted. Record the failure as a 
warning.
+                    ParserUtils.recordParserFailure(this, ze, metadata);
+                    break;
+                }
+                if (entry == null) {
+                    break;
+                }
                 if (shouldUseDataDescriptor && entryCnt.get() > 0) {
                     // Skip already-processed entries on re-read
                     entryCnt.decrementAndGet();
-                    entry = zis.getNextEntry();
                     continue;
                 }
 
@@ -405,8 +416,6 @@ public class ZipParser extends AbstractArchiveParser {
                 if (!shouldUseDataDescriptor) {
                     entryCnt.incrementAndGet();
                 }
-
-                entry = zis.getNextEntry();
             }
         } catch (UnsupportedZipFeatureException zfe) {
             if (zfe.getFeature() == Feature.ENCRYPTION) {

Reply via email to