This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch 3x-ooxml-bigdecimal-dos
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c9f1b44c0f3931ce4d89500fa86678e4f6168fde
Author: tallison <[email protected]>
AuthorDate: Tue May 26 15:48:21 2026 -0400

    fix potential dos issues in ooxml properties files
---
 .../parser/microsoft/ooxml/MetadataExtractor.java  | 259 +++++++++++++++------
 1 file changed, 187 insertions(+), 72 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index 97efe3e186..25e49c31f3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -16,17 +16,23 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.InputStream;
 import java.math.BigDecimal;
 import java.util.Date;
 import java.util.Optional;
 
 import org.apache.poi.ooxml.POIXMLProperties;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
-import 
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
 import 
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.DublinCore;
@@ -37,10 +43,12 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.SummaryExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor;
 import 
org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * OOXML metadata extractor.
@@ -51,6 +59,28 @@ import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
  */
 public class MetadataExtractor {
 
+    private static final String CUSTOM_PROPERTIES_REL =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
+
+    /**
+     * Hard cap on the accumulated text-content of a single property element
+     * inside docProps/custom.xml. Real OOXML property values are at most a few
+     * hundred bytes; anything beyond this is either corruption or an attacker
+     * trying to drive memory or CPU pressure (cf. the {@code <vt:decimal>}
+     * BigDecimal DoS where a 1M-digit literal compresses ~1000:1 in deflate).
+     * 64 KB leaves headroom for any legitimate value while bounding the
+     * slow-path inputs decisively.
+     */
+    static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+    /**
+     * Hard cap on the {@code <vt:decimal>} text length passed to
+     * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+     * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+     * decimal values fit in well under 50 digits; 256 is generous.
+     */
+    static final int MAX_DECIMAL_LENGTH = 256;
+
     private final POIXMLTextExtractor extractor;
 
     public MetadataExtractor(POIXMLTextExtractor extractor) {
@@ -65,7 +95,13 @@ public class MetadataExtractor {
                         extractor instanceof XPSTextExtractor) && 
extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
-            extractMetadata(extractor.getCustomProperties(), metadata);
+            // Custom properties are read via SAX directly from the OPC part
+            // rather than through POI/XMLBeans. The XMLBeans path materializes
+            // an attacker-controlled <vt:decimal> through BigDecimal(String),
+            // which is O(n²) on JDK 17 -- a 3 KB crafted carrier with a
+            // 1,000,000-digit literal burns ~25 s of CPU before this method
+            // even returns. See ooxml-bigdecimal-dos.
+            extractCustomPropertiesViaSAX(extractor.getPackage(), metadata);
         }
     }
 
@@ -157,85 +193,164 @@ public class MetadataExtractor {
         }
     }
 
-    private void extractMetadata(POIXMLProperties.CustomProperties properties, 
Metadata metadata) {
-        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties 
props =
-                properties.getUnderlyingProperties();
-        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-            CTProperty property = props.getPropertyArray(i);
-            String val = null;
-            Date date = null;
-
-            if (property.isSetLpwstr()) {
-                val = property.getLpwstr();
-            } else if (property.isSetLpstr()) {
-                val = property.getLpstr();
-            } else if (property.isSetDate()) {
-                date = property.getDate().getTime();
-            } else if (property.isSetFiletime()) {
-                date = property.getFiletime().getTime();
-            } else if (property.isSetBool()) {
-                val = Boolean.toString(property.getBool());
+    /**
+     * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+     * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+     * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+     * during XML deserialization, which is O(n²) in the digit count on
+     * JDK 17. By reading the part ourselves we can cap both the buffered
+     * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+     * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+     * runs.
+     */
+    private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata 
metadata) {
+        if (opcPackage == null) {
+            return;
+        }
+        try {
+            PackagePart custPart = getRelatedPart(opcPackage, 
CUSTOM_PROPERTIES_REL);
+            if (custPart == null) {
+                return;
+            }
+            CustomPropertiesHandler handler = new CustomPropertiesHandler();
+            try (InputStream is = custPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, handler, new ParseContext());
             }
+            handler.applyTo(metadata);
+        } catch (Exception e) {
+            //swallow
+        }
+    }
 
-            // Integers
-            else if (property.isSetI1()) {
-                val = Integer.toString(property.getI1());
-            } else if (property.isSetI2()) {
-                val = Integer.toString(property.getI2());
-            } else if (property.isSetI4()) {
-                val = Integer.toString(property.getI4());
-            } else if (property.isSetI8()) {
-                val = Long.toString(property.getI8());
-            } else if (property.isSetInt()) {
-                val = Integer.toString(property.getInt());
+    private static PackagePart getRelatedPart(OPCPackage opcPackage, String 
relationshipType) {
+        try {
+            PackageRelationshipCollection rels =
+                    opcPackage.getRelationshipsByType(relationshipType);
+            if (rels == null || rels.size() == 0) {
+                return null;
             }
+            PackageRelationship rel = rels.getRelationship(0);
+            if (rel == null) {
+                return null;
+            }
+            return opcPackage.getPart(rel);
+        } catch (Exception e) {
+            return null;
+        }
+    }
 
-            // Unsigned Integers
-            else if (property.isSetUi1()) {
-                val = Integer.toString(property.getUi1());
-            } else if (property.isSetUi2()) {
-                val = Integer.toString(property.getUi2());
-            } else if (property.isSetUi4()) {
-                val = Long.toString(property.getUi4());
-            } else if (property.isSetUi8()) {
-                val = property.getUi8().toString();
-            } else if (property.isSetUint()) {
-                val = Long.toString(property.getUint());
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream parsing.
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
+    /**
+     * SAX handler for {@code docProps/custom.xml} (custom properties).
+     * Matches the schema defined by Microsoft's
+     * {@code 
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+     * namespace, with value types coming from the {@code vt:} namespace.
+     */
+    static class CustomPropertiesHandler extends DefaultHandler {
+
+        private static final String VT_NS =
+                "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";
+
+        private final Metadata customMetadata = new Metadata();
+        private String currentPropertyName;
+        private String currentValueType;
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) {
+            if ("property".equals(localName)) {
+                currentPropertyName = atts.getValue("name");
+                currentValueType = null;
+            } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+                currentValueType = localName;
+                textBuffer.setLength(0);
             }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            appendCapped(textBuffer, ch, start, length);
+        }
 
-            // Reals
-            else if (property.isSetR4()) {
-                val = Float.toString(property.getR4());
-            } else if (property.isSetR8()) {
-                val = Double.toString(property.getR8());
-            } else if (property.isSetDecimal()) {
-                BigDecimal d = property.getDecimal();
-                if (d == null) {
-                    val = null;
-                } else {
-                    val = d.toPlainString();
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (VT_NS.equals(uri) && currentValueType != null &&
+                    localName.equals(currentValueType) && currentPropertyName 
!= null) {
+                String val = textBuffer.toString().trim();
+                String propName = "custom:" + currentPropertyName;
+                switch (currentValueType) {
+                    case "lpwstr":
+                    case "lpstr":
+                    case "bstr":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "filetime":
+                    case "date":
+                        Property tikaProp = Property.externalDate(propName);
+                        customMetadata.set(tikaProp, val);
+                        break;
+                    case "bool":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "i1":
+                    case "i2":
+                    case "i4":
+                    case "int":
+                    case "ui1":
+                    case "ui2":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "i8":
+                    case "ui4":
+                    case "ui8":
+                    case "uint":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "r4":
+                    case "r8":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "decimal":
+                        // BigDecimal(String) is O(n²) on JDK 17; cap the input
+                        // length to keep an attacker-controlled <vt:decimal>
+                        // from burning CPU. Real values are < 50 chars; 256 is
+                        // generous. See ooxml-bigdecimal-dos.
+                        if (val.length() > MAX_DECIMAL_LENGTH) {
+                            break;
+                        }
+                        try {
+                            BigDecimal d = new BigDecimal(val);
+                            customMetadata.set(propName, d.toPlainString());
+                        } catch (NumberFormatException e) {
+                            //swallow
+                        }
+                        break;
+                    default:
+                        break;
                 }
-            } else if (property.isSetArray()) {
-                // TODO Fetch the array values and output
-            } else if (property.isSetVector()) {
-                // TODO Fetch the vector values and output
-            } else if (property.isSetBlob() || property.isSetOblob()) {
-                // TODO Decode, if possible
-            } else if (property.isSetStream() || property.isSetOstream() ||
-                    property.isSetVstream()) {
-                // TODO Decode, if possible
-            } else if (property.isSetStorage() || property.isSetOstorage()) {
-                // TODO Decode, if possible
-            } else {
-                // This type isn't currently supported yet, skip the property
+                currentValueType = null;
+            } else if ("property".equals(localName)) {
+                currentPropertyName = null;
             }
+        }
 
-            String propName = "custom:" + property.getName();
-            if (date != null) {
-                Property tikaProp = Property.externalDate(propName);
-                metadata.set(tikaProp, date);
-            } else if (val != null) {
-                metadata.set(propName, val);
+        void applyTo(Metadata metadata) {
+            for (String name : customMetadata.names()) {
+                for (String value : customMetadata.getValues(name)) {
+                    metadata.add(name, value);
+                }
             }
         }
     }

Reply via email to