This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4744 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 77b754e91bafa974397a4e16c45c384dcc9b3305 Author: tallison <[email protected]> AuthorDate: Thu May 28 10:14:05 2026 -0400 TIKA-4744 - fix pdf tags --- .../org/apache/tika/parser/pdf/XFAExtractor.java | 134 +++++++++++---------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 25 ++++ .../test-documents/testPDF_malformedXFA.pdf | 27 +++++ 3 files changed, 123 insertions(+), 63 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java index 2ca42620d2..3bc4cd83d9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java @@ -74,77 +74,85 @@ class XFAExtractor { void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m, ParseContext context) throws XMLStreamException, SAXException { + // The reader loop below can throw XMLStreamException on malformed XFA + // XML; the caller (AbstractPDF2XHTML.extractAcroForm) catches that and + // falls through to the AcroForm path, which emits more content. If the + // xfa_content div is left open at the throw point, that fallback + // content nests under it and the outer </body> can't balance. Wrap the + // body in try/finally so the open is always paired with a close. xhtml.startElement("div", "class", "xfa_content"); - - //TODO - replace this with multivalued map? This isn't - //actually metadata, just a handy data structure. - Metadata pdfObjRToValues = new Metadata(); - - //for now, store and dump the fields in insertion order - Map<String, XFAField> namedFields = new LinkedHashMap<>(); - - //The strategy is to cache the fields in fields - //and cache the values in pdfObjRToValues while - //handling the text etc along the way. - // - //As a final step, dump the merged fields and the values. - - XMLStreamReader reader = XMLReaderUtils.getXMLInputFactory(context).createXMLStreamReader(xfaIs); - while (reader.hasNext()) { - switch (reader.next()) { - case XMLStreamConstants.START_ELEMENT: - QName name = reader.getName(); - String localName = name.getLocalPart(); - if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() && - FIELD_LN.equals(name.getLocalPart())) { - handleField(reader, namedFields); - } else if (XFA_DATA.equals(name)) { //full qname match is important! - loadData(reader, pdfObjRToValues); - } else if (textMatcher.reset(localName).find()) { - scrapeTextUntil(reader, xhtml, name); - } - break; - case XMLStreamConstants.END_ELEMENT: - break; + try { + //TODO - replace this with multivalued map? This isn't + //actually metadata, just a handy data structure. + Metadata pdfObjRToValues = new Metadata(); + + //for now, store and dump the fields in insertion order + Map<String, XFAField> namedFields = new LinkedHashMap<>(); + + //The strategy is to cache the fields in fields + //and cache the values in pdfObjRToValues while + //handling the text etc along the way. + // + //As a final step, dump the merged fields and the values. + + XMLStreamReader reader = + XMLReaderUtils.getXMLInputFactory(context).createXMLStreamReader(xfaIs); + while (reader.hasNext()) { + switch (reader.next()) { + case XMLStreamConstants.START_ELEMENT: + QName name = reader.getName(); + String localName = name.getLocalPart(); + if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() && + FIELD_LN.equals(name.getLocalPart())) { + handleField(reader, namedFields); + } else if (XFA_DATA.equals(name)) { //full qname match is important! + loadData(reader, pdfObjRToValues); + } else if (textMatcher.reset(localName).find()) { + scrapeTextUntil(reader, xhtml, name); + } + break; + case XMLStreamConstants.END_ELEMENT: + break; + } } - } - if (namedFields.size() == 0) { - xhtml.endElement("div"); - return; - } - //now dump fields and values - xhtml.startElement("div", "class", "xfa_form"); - xhtml.startElement("ol"); - StringBuilder sb = new StringBuilder(); - for (Map.Entry<String, XFAField> e : namedFields.entrySet()) { - String fieldName = e.getKey(); - XFAField field = e.getValue(); - String displayFieldName = - (field.toolTip == null || field.toolTip.isBlank()) ? fieldName : - field.toolTip; - String[] fieldValues = pdfObjRToValues.getValues(fieldName); - if (fieldValues.length == 0) { - fieldValues = new String[]{""}; + if (namedFields.size() == 0) { + return; } - for (String fieldValue : fieldValues) { - AttributesImpl attrs = new AttributesImpl(); - attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName); - - sb.append(displayFieldName).append(": "); - if (fieldValue != null) { - sb.append(fieldValue); + //now dump fields and values + xhtml.startElement("div", "class", "xfa_form"); + xhtml.startElement("ol"); + StringBuilder sb = new StringBuilder(); + for (Map.Entry<String, XFAField> e : namedFields.entrySet()) { + String fieldName = e.getKey(); + XFAField field = e.getValue(); + String displayFieldName = + (field.toolTip == null || field.toolTip.isBlank()) ? fieldName : + field.toolTip; + String[] fieldValues = pdfObjRToValues.getValues(fieldName); + if (fieldValues.length == 0) { + fieldValues = new String[]{""}; } + for (String fieldValue : fieldValues) { + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName); - xhtml.startElement("li", attrs); - xhtml.characters(sb.toString()); - xhtml.endElement("li"); - sb.setLength(0); + sb.append(displayFieldName).append(": "); + if (fieldValue != null) { + sb.append(fieldValue); + } + + xhtml.startElement("li", attrs); + xhtml.characters(sb.toString()); + xhtml.endElement("li"); + sb.setLength(0); + } } + xhtml.endElement("ol"); + xhtml.endElement("div"); + } finally { + xhtml.endElement("div"); } - xhtml.endElement("ol"); - xhtml.endElement("div"); - xhtml.endElement("div"); } //try to scrape the text until the endElement diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index e73471e353..ccae004def 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1037,6 +1037,31 @@ public class PDFParserTest extends TikaTest { assertNotContained("Mount Rushmore National Memorial", xml); } + @Test //TIKA-4744 + public void testMalformedXFADivBalanced() throws Exception { + // A PDF whose XFA stream is malformed used to leave <div class= + // "xfa_content"> open: XFAExtractor.extract opened the div first and + // then threw XMLStreamException from reader.next(), the caller logged + // and fell through to AcroForm fallback (which emitted its own divs + // nested inside the unclosed xfa_content), and endDocument failed + // with </body> not matching topmost <div>. + // getXML wraps the handler in StrictXHTMLValidator, so any imbalance + // would throw before the assertions. + XMLResult r = getXML("testPDF_malformedXFA.pdf"); + // Caller recorded the XMLStreamException as a warning rather than a + // fatal -- confirm that didn't change. + String[] warnings = r.metadata.getValues( + TikaCoreProperties.TIKA_META_EXCEPTION_WARNING); + boolean xfaWarningPresent = false; + for (String w : warnings) { + if (w.contains("XFAExtractor")) { + xfaWarningPresent = true; + break; + } + } + assertTrue(xfaWarningPresent, "expected an XFAExtractor warning"); + } + @Test public void testXMPMM() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_malformedXFA.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_malformedXFA.pdf new file mode 100755 index 0000000000..8158f02a2e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_malformedXFA.pdf @@ -0,0 +1,27 @@ +%PDF-1.7 +1 0 obj +<</Parent 2 0 R/Contents 6 0 R/Type/Page/Resources<<>>/MediaBox[0 0 1 1]>> +endobj +2 0 obj +<</Kids[1 0 R]/Type/Pages/Count 1>> +endobj +3 0 obj +<<>> +stream +<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><pdf/><template><subform use="xfa"> +endstream +endobj +4 0 obj +<</XFA 3 0 R>> +endobj +5 0 obj +<</AcroForm 4 0 R/Type/Catalog/Pages 2 0 R/NeedsRendering true>> +endobj +6 0 obj +<</Length 0>> +stream +endstream +endobj +trailer +<</Size 7/Root 5 0 R>> +%%EOF \ No newline at end of file
