This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4744
in repository https://gitbox.apache.org/repos/asf/tika.git

commit edafcbb16989f0c69e2ddd0698b1ceaf33b13a8e
Author: tallison <[email protected]>
AuthorDate: Thu May 28 09:31:40 2026 -0400

    TIKA-4744 - fix odt tags
---
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  48 ++++++++++++++-------
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  14 ++++++
 .../testODT_svgTitleInStyledSpan.odt               | Bin 0 -> 6820 bytes
 3 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 6bbc7eaa86..60c1457d68 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -337,19 +337,31 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
     }
 
     /**
-     * Returns true for ODF elements that map to block-level XHTML and so
-     * shouldn't sit inside open inline-style tags. When such an element opens
-     * while {@code <b>/<i>/<u>} are on the SAX stack, the inline tags would
-     * trap the new block element underneath them; subsequent style flips
-     * inside the block would emit close events that don't match the topmost
-     * open element. The startElement handler closes pending style tags
-     * before opening any of these.
+     * Returns true for ODF elements that shouldn't sit inside open inline-style
+     * tags. When such an element opens while {@code <b>/<i>/<u>} are on the SAX
+     * stack, the inline tags would trap the new element underneath them;
+     * subsequent style flips inside would emit close events that don't match
+     * the topmost open element. The startElement handler closes pending style
+     * tags before opening any of these.
      * <p>
+     * Two cases qualify:
+     * <ul>
+     *   <li>Block-level XHTML targets (draw:text-box, table/row/cell, list-item)
+     *       — opening a block under inline styles produces malformed XHTML even
+     *       if the SAX stream happened to balance.</li>
+     *   <li>svg:title / svg:desc — empty or near-empty inline elements that map
+     *       to {@code <span>} via MAPPINGS. When their parent {@code <text:span>}
+     *       had a bold/italic/underline style, the outer {@code <b>/<i>/<u>}
+     *       is still on top of the SAX stack when the svg's {@code <span>}
+     *       opens; the existing endElement closeStyleTags then tries to close
+     *       {@code </b>} while the svg span is topmost, which the strict
+     *       validator (correctly) rejects.</li>
+     * </ul>
      * text:p / text:h / text:list / annotation / note / notes / a are handled
      * by their own branches in startElement and never reach the default
      * branch where this check is used.
      */
-    private static boolean isBlockLevelOpen(String uri, String localName) {
+    private static boolean closeStylesBeforeOpen(String uri, String localName) {
         if (DRAW_NS.equals(uri) && "text-box".equals(localName)) {
             return true;
         }
@@ -358,6 +370,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
                         || "table-cell".equals(localName))) {
             return true;
         }
+        if (SVG_NS.equals(uri) && ("title".equals(localName) || "desc".equals(localName))) {
+            return true;
+        }
         return TEXT_NS.equals(uri) && "list-item".equals(localName);
     }
 
@@ -486,14 +501,15 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler {
                 // inside it. See updateStyleTags / closeStyleTags.
                 anchorDepth++;
                 super.startElement(namespaceURI, localName, qName, attrs);
-            } else if (isBlockLevelOpen(namespaceURI, localName)) {
-                // Block-level structural elements (draw:text-box -> <div>,
-                // table:table -> <table>, etc.) opened while <b>/<i>/<u> are
-                // on top would trap those inline tags. Subsequent style flips
-                // inside would emit </b> while the block is on top, producing
-                // cross-nested XHTML. Close pending styles before opening the
-                // block; if there's still text to emit at the same style after
-                // the block closes, updateStyleTags() will reopen them.
+            } else if (closeStylesBeforeOpen(namespaceURI, localName)) {
+                // Elements that mustn't open under <b>/<i>/<u>: block-level
+                // structural elements (draw:text-box -> <div>, table:table ->
+                // <table>, etc.) and svg:title / svg:desc inline shells that
+                // map to <span>. Closing pending style tags first ensures the
+                // new element opens at body/paragraph/span level, not nested
+                // under stale inline styling. If there's still text to emit at
+                // the same style after the element closes, updateStyleTags()
+                // will reopen them.
                 closeStyleTags();
                 super.startElement(namespaceURI, localName, qName, attrs);
             } else {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index cded9ea148..bedcc038c2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -365,6 +365,20 @@ public class ODFParserTest extends TikaTest {
                 xml);
     }
 
+    @Test //TIKA-4744
+    public void testSvgTitleInStyledSpan() throws Exception {
+        // Empty <svg:title/>/<svg:desc/> inside a <draw:connector> or
+        // <draw:custom-shape> that is itself wrapped in a styled <text:span>
+        // used to leave the SAX stack with the svg's <span> sitting above the
+        // outer <b>. The endElement closeStyleTags (TIKA-4728) then emitted
+        // </b> while <span> was topmost, which StrictXHTMLValidator rejects.
+        // getXML wraps the handler in StrictXHTMLValidator, so a desync would
+        // throw before any assertions ran.
+        String xml = getXML("testODT_svgTitleInStyledSpan.odt").xml;
+        assertContains("國立雲林科技大學國際", xml);
+        assertContains("學生簽章", xml);
+    }
+
     @Test
     public void testEmbedded() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt
new file mode 100755
index 0000000000..d2f09b5d69
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testODT_svgTitleInStyledSpan.odt differ

Reply via email to