This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new cabd1f2d44 fix potential sax dos (#2838)
cabd1f2d44 is described below

commit cabd1f2d440a80e29f29c0b467ab96caf69aaa30
Author: Tim Allison <[email protected]>
AuthorDate: Wed May 27 10:26:21 2026 -0400

    fix potential sax dos (#2838)
    
    BigDecimal dos found and reported by @tonghuaroot.
---
 .../microsoft/ooxml/SAXBasedMetadataExtractor.java |  89 +++++++--
 .../ooxml/SAXBasedMetadataExtractorTest.java       | 216 +++++++++++++++++++++
 2 files changed, 292 insertions(+), 13 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
index 632eb0c6c8..df3c6aeb38 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractor.java
@@ -56,6 +56,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
     private static final String CUSTOM_PROPERTIES_REL =
             
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";;
 
+    /**
+     * Hard cap on the accumulated text-content of a single property element.
+     * Real OOXML property values are at most a few hundred bytes; anything 
beyond
+     * this is either corruption or an attacker trying to drive memory or CPU
+     * pressure (cf. the {@code <vt:decimal>} BigDecimal DoS where a 1M-digit
+     * literal compresses ~1000:1 in deflate). 64 KB leaves headroom for any
+     * legitimate value while bounding the slow-path inputs decisively.
+     */
+    static final int MAX_TEXT_BUFFER_LENGTH = 64 * 1024;
+
+    /**
+     * Hard cap on the {@code <vt:decimal>} text length passed to
+     * {@link BigDecimal#BigDecimal(String)}. JDK 17's parser is O(n²) in the
+     * digit count, so even a 64 KB string costs noticeable CPU. Real-world
+     * decimal values fit in well under 50 digits; 256 is generous.
+     */
+    static final int MAX_DECIMAL_LENGTH = 256;
+
     private final OPCPackage opcPackage;
     private final ParseContext parseContext;
 
@@ -186,10 +204,24 @@ class SAXBasedMetadataExtractor extends MetadataExtractor 
{
         SummaryExtractor.addMulti(metadata, property, value.get());
     }
 
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream parsing
+     * (which will either accept the prefix or reject it as a 
NumberFormatException).
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
     /**
      * SAX handler for docProps/app.xml (extended properties).
      */
-    private static class ExtendedPropertiesHandler extends DefaultHandler {
+    static class ExtendedPropertiesHandler extends DefaultHandler {
 
         private String application;
         private String appVersion;
@@ -219,7 +251,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
 
         @Override
         public void characters(char[] ch, int start, int length) {
-            textBuffer.append(ch, start, length);
+            appendCapped(textBuffer, ch, start, length);
         }
 
         @Override
@@ -364,7 +396,7 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
     /**
      * SAX handler for docProps/custom.xml (custom properties).
      */
-    private static class CustomPropertiesHandler extends DefaultHandler {
+    static class CustomPropertiesHandler extends DefaultHandler {
 
         private static final String VT_NS =
                 
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";;
@@ -379,7 +411,14 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
             if ("property".equals(localName)) {
                 currentPropertyName = atts.getValue("name");
                 currentValueType = null;
-            } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+            } else if (VT_NS.equals(uri) && currentPropertyName != null
+                    && currentValueType == null) {
+                // First vt: child under <property> wins. The == null guard 
keeps
+                // <vt:vector>/<vt:array> containers latched as the type so 
their
+                // inner children (vt:lpstr, vt:i4, ...) don't get re-emitted 
as
+                // a scalar custom property. The container itself falls through
+                // the endElement switch's default branch (no emit), matching 
the
+                // prior POI path that explicitly skipped vector/array.
                 currentValueType = localName;
                 textBuffer.setLength(0);
             }
@@ -387,28 +426,41 @@ class SAXBasedMetadataExtractor extends MetadataExtractor 
{
 
         @Override
         public void characters(char[] ch, int start, int length) {
-            textBuffer.append(ch, start, length);
+            appendCapped(textBuffer, ch, start, length);
         }
 
         @Override
         public void endElement(String uri, String localName, String qName) {
             if (VT_NS.equals(uri) && currentValueType != null &&
                     localName.equals(currentValueType) && currentPropertyName 
!= null) {
-                String val = textBuffer.toString().trim();
+                // Legacy POI's typed accessors (getLpwstr/getLpstr/getBstr) 
returned
+                // the raw element text, while numeric/date/bool accessors 
yielded
+                // already-normalized forms. Mirror that here: keep whitespace 
in
+                // strings, work with the trimmed form everywhere else.
+                String raw = textBuffer.toString();
+                String trimmed = raw.trim();
                 String propName = "custom:" + currentPropertyName;
                 switch (currentValueType) {
                     case "lpwstr":
                     case "lpstr":
                     case "bstr":
-                        customMetadata.set(propName, val);
+                        customMetadata.set(propName, raw);
                         break;
                     case "filetime":
                     case "date":
                         Property tikaProp = Property.externalDate(propName);
-                        customMetadata.set(tikaProp, val);
+                        customMetadata.set(tikaProp, trimmed);
                         break;
                     case "bool":
-                        customMetadata.set(propName, val);
+                        // xs:boolean lexical space is {true,false,1,0}. 
Legacy POI
+                        // routed through Boolean.toString(getBool()) so 
consumers
+                        // doing "true".equals(...) never saw the 1/0 form. 
Anything
+                        // outside the lexical space is dropped, not stored 
verbatim.
+                        if ("1".equals(trimmed) || "true".equals(trimmed)) {
+                            customMetadata.set(propName, "true");
+                        } else if ("0".equals(trimmed) || 
"false".equals(trimmed)) {
+                            customMetadata.set(propName, "false");
+                        }
                         break;
                     case "i1":
                     case "i2":
@@ -416,21 +468,28 @@ class SAXBasedMetadataExtractor extends MetadataExtractor 
{
                     case "int":
                     case "ui1":
                     case "ui2":
-                        customMetadata.set(propName, val);
+                        customMetadata.set(propName, trimmed);
                         break;
                     case "i8":
                     case "ui4":
                     case "ui8":
                     case "uint":
-                        customMetadata.set(propName, val);
+                        customMetadata.set(propName, trimmed);
                         break;
                     case "r4":
                     case "r8":
-                        customMetadata.set(propName, val);
+                        customMetadata.set(propName, trimmed);
                         break;
                     case "decimal":
+                        // BigDecimal(String) is O(n²) on JDK 17; cap the input
+                        // length to keep an attacker-controlled <vt:decimal>
+                        // from burning CPU. Real values are < 50 chars; 256 is
+                        // generous. See ooxml-bigdecimal-dos.
+                        if (trimmed.length() > MAX_DECIMAL_LENGTH) {
+                            break;
+                        }
                         try {
-                            BigDecimal d = new BigDecimal(val);
+                            BigDecimal d = new BigDecimal(trimmed);
                             customMetadata.set(propName, d.toPlainString());
                         } catch (NumberFormatException e) {
                             //swallow
@@ -442,6 +501,10 @@ class SAXBasedMetadataExtractor extends MetadataExtractor {
                 currentValueType = null;
             } else if ("property".equals(localName)) {
                 currentPropertyName = null;
+                // Defensive: if a malformed custom.xml left a vt: container 
open
+                // (e.g. <vt:vector> without a matching close before 
</property>),
+                // make sure the next property doesn't inherit it.
+                currentValueType = null;
             }
         }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
new file mode 100644
index 0000000000..99a89f3ef2
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SAXBasedMetadataExtractorTest.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Tests for length-cap defenses in {@link SAXBasedMetadataExtractor}.
+ * <p>
+ * Both caps target attacker-controlled docProps/custom.xml. A 3 KB OOXML
+ * carrier whose {@code <vt:decimal>} contains a 1,000,000-digit numeric
+ * literal would otherwise burn ~25 s of CPU per file in JDK 17's
+ * {@code BigDecimal(String)} (O(n²)).
+ */
+public class SAXBasedMetadataExtractorTest {
+
+    private static final String CUSTOM_HEADER = "<?xml version=\"1.0\"?>"
+            + "<Properties 
xmlns=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/custom-properties\""
+            + " xmlns:vt=\"http://schemas.openxmlformats.org/officeDocument";
+            + "/2006/docPropsVTypes\">";
+    private static final String CUSTOM_FOOTER = "</Properties>";
+
+    @Test
+    public void appendCappedTruncatesAtLimit() {
+        StringBuilder buf = new StringBuilder();
+        char[] giant = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH + 10_000];
+        java.util.Arrays.fill(giant, '9');
+
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, giant.length);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "buffer must be capped at MAX_TEXT_BUFFER_LENGTH");
+
+        // Further appends after the cap are silent no-ops.
+        SAXBasedMetadataExtractor.appendCapped(buf, giant, 0, 100);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "appends past the cap must be silently dropped");
+    }
+
+    @Test
+    public void appendCappedRespectsRemainingRoom() {
+        StringBuilder buf = new StringBuilder();
+        // Pre-fill to one short of the cap; next 3 chars should partially 
land.
+        char[] padding = new 
char[SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH - 1];
+        java.util.Arrays.fill(padding, 'x');
+        buf.append(padding);
+
+        SAXBasedMetadataExtractor.appendCapped(buf, new char[]{'a', 'b', 'c'}, 
0, 3);
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
buf.length(),
+                "remaining room (1 char) must be filled; overflow dropped");
+        assertEquals('a', buf.charAt(buf.length() - 1));
+    }
+
+    @Test
+    public void normalDecimalIsExtracted() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("price", "decimal", 
"1234.56"));
+        assertEquals("1234.56", m.get("custom:price"));
+    }
+
+    @Test
+    public void decimalAtMaxLengthIsAccepted() throws Exception {
+        // Boundary: exactly MAX_DECIMAL_LENGTH digits must still parse. This 
is
+        // the upper edge of the accept-and-parse path.
+        String digits = 
"9".repeat(SAXBasedMetadataExtractor.MAX_DECIMAL_LENGTH);
+        Metadata m = parseCustomProperties(customProperty("ok", "decimal", 
digits));
+        assertEquals(digits, m.get("custom:ok"),
+                "decimal of exactly MAX_DECIMAL_LENGTH digits must 
round-trip");
+    }
+
+    @Test
+    public void decimalOneOverMaxLengthIsRejected() throws Exception {
+        // Boundary: one character past the cap must short-circuit before
+        // BigDecimal(String). Combined with appendCappedTruncatesAtLimit this
+        // mechanically proves the O(n²) parser is never invoked above the cap.
+        String digits = 
"9".repeat(SAXBasedMetadataExtractor.MAX_DECIMAL_LENGTH + 1);
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
digits));
+        assertNull(m.get("custom:evil"),
+                "decimal one char over MAX_DECIMAL_LENGTH must be rejected, 
not parsed");
+    }
+
+    @Test
+    public void oversizedDecimalAttackPayloadIsRejected() throws Exception {
+        // Reporter's actual attack shape: 1,000,000 digits. The SAX read
+        // truncates accumulation at MAX_TEXT_BUFFER_LENGTH (64 KB) via
+        // appendCapped, then the decimal-length check rejects the truncated
+        // value before BigDecimal(String) runs. No wall-clock assertion —
+        // the boundary tests above are the mechanical proof that
+        // BigDecimal is never called on an oversized payload.
+        String attackDigits = "9".repeat(1_000_000);
+        Metadata m = parseCustomProperties(customProperty("evil", "decimal", 
attackDigits));
+        assertNull(m.get("custom:evil"),
+                "1M-digit attack payload must be rejected without parsing");
+    }
+
+    @Test
+    public void oversizedStringIsTruncatedNotRejected() throws Exception {
+        // A large lpwstr isn't a CPU-DoS like decimal, but unbounded text
+        // accumulation would still be a memory pressure vector. The buffer
+        // cap stops accumulation at 64 KB; the truncated value still flows.
+        String giantString = "a".repeat(200_000);
+        Metadata m = parseCustomProperties(customProperty("bigstr", "lpwstr", 
giantString));
+        String got = m.get("custom:bigstr");
+        assertNotNull(got, "string-typed property survives truncation");
+        assertEquals(SAXBasedMetadataExtractor.MAX_TEXT_BUFFER_LENGTH, 
got.length(),
+                "string value must be capped at MAX_TEXT_BUFFER_LENGTH");
+    }
+
+    @Test
+    public void stringValuesPreserveLeadingAndTrailingWhitespace() throws 
Exception {
+        // Legacy POI's getLpwstr/getLpstr/getBstr returned the raw element 
text;
+        // anything that depends on whitespace inside the value would regress 
if
+        // the SAX path silently trimmed.
+        Metadata m = parseCustomProperties(
+                customProperty("padded", "lpwstr", "  hello world  "));
+        assertEquals("  hello world  ", m.get("custom:padded"),
+                "lpwstr must preserve leading/trailing whitespace");
+
+        m = parseCustomProperties(customProperty("padded", "lpstr", " ascii 
"));
+        assertEquals(" ascii ", m.get("custom:padded"));
+
+        m = parseCustomProperties(customProperty("padded", "bstr", 
"\tindented\n"));
+        assertEquals("\tindented\n", m.get("custom:padded"));
+    }
+
+    @Test
+    public void boolLexicalOneIsNormalizedToTrue() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("flag", "bool", 
"1"));
+        assertEquals("true", m.get("custom:flag"),
+                "<vt:bool>1</vt:bool> must normalize to \"true\" (matching 
legacy POI)");
+    }
+
+    @Test
+    public void boolLexicalZeroIsNormalizedToFalse() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("flag", "bool", 
"0"));
+        assertEquals("false", m.get("custom:flag"),
+                "<vt:bool>0</vt:bool> must normalize to \"false\" (matching 
legacy POI)");
+    }
+
+    @Test
+    public void boolLexicalTrueAndFalsePassThrough() throws Exception {
+        Metadata m = parseCustomProperties(customProperty("flag", "bool", 
"true"));
+        assertEquals("true", m.get("custom:flag"));
+
+        m = parseCustomProperties(customProperty("flag", "bool", "false"));
+        assertEquals("false", m.get("custom:flag"));
+    }
+
+    @Test
+    public void vectorContainingScalarIsNotEmittedAsScalar() throws Exception {
+        // <vt:vector> with inner <vt:lpstr> children. The container latches as
+        // the value type; inner children must NOT overwrite it, and the
+        // container itself must not be emitted as a scalar. Legacy POI skipped
+        // vector/array entirely.
+        String xml = CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"tags\">"
+                + "<vt:vector size=\"2\" baseType=\"lpstr\">"
+                + "<vt:lpstr>alpha</vt:lpstr>"
+                + "<vt:lpstr>beta</vt:lpstr>"
+                + "</vt:vector>"
+                + "</property>"
+                + CUSTOM_FOOTER;
+        Metadata m = parseCustomProperties(xml);
+        assertNull(m.get("custom:tags"),
+                "vector container must not emit a scalar custom property");
+    }
+
+    // ===== helpers =====
+
+    private static String customProperty(String name, String type, String 
value) {
+        return CUSTOM_HEADER
+                + "<property fmtid=\"{DEADBEEF-0000-0000-0000-000000000000}\" 
pid=\"2\""
+                + " name=\"" + name + "\">"
+                + "<vt:" + type + ">" + value + "</vt:" + type + ">"
+                + "</property>"
+                + CUSTOM_FOOTER;
+    }
+
+    private static Metadata parseCustomProperties(String xml) throws Exception 
{
+        SAXBasedMetadataExtractor.CustomPropertiesHandler handler =
+                new SAXBasedMetadataExtractor.CustomPropertiesHandler();
+        try (InputStream is = new ByteArrayInputStream(
+                xml.getBytes(StandardCharsets.UTF_8))) {
+            XMLReaderUtils.parseSAX(is, handler, new ParseContext());
+        }
+        Metadata metadata = new Metadata();
+        handler.applyTo(metadata);
+        return metadata;
+    }
+}

Reply via email to