This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4744 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 92220e060f68fc8e60bb1fa2071d418df94b2244 Author: tallison <[email protected]> AuthorDate: Thu May 28 10:05:53 2026 -0400 TIKA-4744 - fix rtf tags --- .../tika/parser/microsoft/rtf/TextExtractor.java | 37 +++++++++++++++++++--- .../tika/parser/microsoft/rtf/RTFParserTest.java | 17 ++++++++++ .../testRTF_nestedHyperlinkPageRef.rtf | 9 ++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java index 72913ae437..b4ad6ac544 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java @@ -311,6 +311,12 @@ final class TextExtractor { // Non-null if we've seen the url for a HYPERLINK but not yet // its text: private String pendingURL; + // Group depth at which the current <a href=...> was opened (in + // groupState.depth units). Used to defer </a> emission until we leave + // the fldrslt group that opened it, so a nested \field (e.g., PAGEREF + // inside a HYPERLINK's fldrslt) doesn't prematurely close the outer <a> + // via the fieldState==3 branch in processGroupEnd. -1 means no <a> open. + private int hyperlinkAnchorDepth = -1; // Used to process the sub-groups inside the upr // group: private int uprState = -1; @@ -1365,8 +1371,18 @@ final class TextExtractor { addOutputChar('\u201D'); } } else if (equals("fldinst")) { - fieldState = 1; - groupState.ignore = false; + if (fieldState == 0) { + fieldState = 1; + groupState.ignore = false; + } else { + // Nested \fldinst inside an outer field (e.g., PAGEREF inside + // a HYPERLINK's fldrslt). Suppress the nested instruction text + // and leave the outer fieldState/pendingURL untouched: the + // outer field's closing group still needs to see fieldState==3 + // to emit </a>. The accompanying \fldrslt of the nested field + // emits its display text into the outer hyperlink's <a>. + groupState.ignore = true; + } } else if (equals("fldrslt") && fieldState == 2) { assert pendingURL != null; lazyStartParagraph(); @@ -1375,6 +1391,10 @@ final class TextExtractor { out.startElement("", "a", "a", attrs); pendingURL = null; fieldState = 3; + // Remember which group depth owns this <a>. processGroupEnd only + // emits </a> once we leave this depth, so nested groups inside + // the fldrslt (e.g., a PAGEREF \field) don't trip the close early. + hyperlinkAnchorDepth = groupState.depth; groupState.ignore = false; } } @@ -1547,8 +1567,17 @@ final class TextExtractor { // inlined, but fail to record them in metadata // as a field value. } else if (fieldState == 3) { - end("a"); - fieldState = 0; + // Only close </a> once we've left the fldrslt group that opened + // it. groupState.depth here is the OUTER group we just restored + // to; if it's now below the recorded anchor depth, the fldrslt + // group has closed. This guards against nested-field group ends + // (e.g., a PAGEREF \fldinst inside the HYPERLINK fldrslt) closing + // the outer </a> too early. + if (hyperlinkAnchorDepth >= 0 && groupState.depth < hyperlinkAnchorDepth) { + end("a"); + fieldState = 0; + hyperlinkAnchorDepth = -1; + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java index 33243c963a..2c0a91bf9c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java @@ -60,6 +60,23 @@ public class RTFParserTest extends TikaTest { assertContains("indexation Word", content); } + @Test //TIKA-4744 + public void testNestedHyperlinkPageRef() throws Exception { + // A HYPERLINK field with a PAGEREF \field nested inside its \fldrslt + // used to leak: the nested \fldinst would overwrite fieldState=3 with + // 1, so the outer fldrslt's group close skipped the </a> emission and + // <a> stayed open. The cascade surfaced at endDocument as the strict + // validator complaining </body> didn't match topmost <p>. + // getXML wraps the handler in StrictXHTMLValidator, so any imbalance + // would throw before the assertions. + XMLResult r = getXML("testRTF_nestedHyperlinkPageRef.rtf"); + // PAGEREF result "42" should render INSIDE the outer hyperlink's <a>, + // not after a prematurely-closed </a>: + assertContains("<a href=\"#target\">42</a>", r.xml); + assertContains("Before", r.xml); + assertContains("after.", r.xml); + } + @Test public void testUmlautSpacesExtraction2() throws Exception { String content = getText("testRTFUmlautSpaces2.rtf"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf new file mode 100644 index 0000000000..250b56c558 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTF_nestedHyperlinkPageRef.rtf @@ -0,0 +1,9 @@ +{\rtf1\ansi +{\fonttbl{\f0 Arial;}} +\pard {\f0 Before } +{\field {\*\fldinst HYPERLINK "#target" }{\fldrslt +{\field {\*\fldinst { PAGEREF "target" } }{\fldrslt 42}} +}} +{\f0 after.} +\par +}
