This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 19b4c66927 TIKA-4728 - fix xhtml in widgets (#2817)
19b4c66927 is described below
commit 19b4c66927435a813770f514d020418c4199ef8f
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 21 13:20:01 2026 -0400
TIKA-4728 - fix xhtml in widgets (#2817)
---
.../ParsingEmbeddedDocumentExtractor.java | 29 ++-
.../tika/sax/BasicContentHandlerFactory.java | 30 +++
.../org/apache/tika/sax/StrictXHTMLValidator.java | 229 +++++++++++++++++++++
.../org/apache/tika/sax/XHTMLBalancingHandler.java | 123 +++++++++++
.../src/test/java/org/apache/tika/TikaTest.java | 37 +++-
.../apache/tika/sax/XHTMLBalancingHandlerTest.java | 130 ++++++++++++
.../tika/parser/iwork/PagesContentHandler.java | 19 +-
.../java/org/apache/tika/parser/prt/PRTParser.java | 85 ++++----
.../apache/tika/parser/code/SourceCodeParser.java | 25 +++
.../tika/parser/code/SourceCodeParserTest.java | 1 -
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 15 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 55 +++++
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 7 +
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 16 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 6 +
.../ooxml/XSSFExcelExtractorDecorator.java | 36 ++++
.../org/apache/tika/parser/epub/EpubParser.java | 44 +++-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 77 +++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 ++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 14 ++
.../test-documents/testPDF_jsActionOnPage.pdf | 26 +++
.../java/org/apache/tika/parser/txt/TXTParser.java | 27 ++-
.../java/org/apache/tika/parser/tmx/TMXParser.java | 2 +
.../apache/tika/parser/xliff/XLIFF12Parser.java | 3 +-
24 files changed, 974 insertions(+), 87 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 5852630d0b..194d863d2d 100644
---
a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -40,6 +40,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.SAXOutputConfig;
+import org.apache.tika.sax.XHTMLBalancingHandler;
/**
* Helper class for parsers of package archives or other compound document
@@ -163,11 +164,22 @@ public class ParsingEmbeddedDocumentExtractor implements
EmbeddedDocumentExtract
handler.endElement(XHTML, "h1", "h1");
}
+ // Wrap the delegate's handler so we can close anything it left open if
+ // it throws mid-element. Without this, the </div> emitted in finally
+ // could land on top of an open <p>/<table>/etc. from the failed
+ // sub-parse and produce malformed XHTML.
+ XHTMLBalancingHandler balancer =
+ outputHtml ? new XHTMLBalancingHandler(handler) : null;
+ ContentHandler delegateHandler = outputHtml ? balancer : handler;
+
// Use the delegate parser to parse this entry
+ boolean parsedCleanly = false;
try {
tis.setCloseShield();
- DELEGATING_PARSER.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(handler)),
+ DELEGATING_PARSER.parse(tis,
+ new EmbeddedContentHandler(new
BodyContentHandler(delegateHandler)),
metadata, context);
+ parsedCleanly = true;
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
@@ -178,10 +190,17 @@ public class ParsingEmbeddedDocumentExtractor implements
EmbeddedDocumentExtract
recordException(e, context);
} finally {
tis.removeCloseShield();
- }
-
- if (outputHtml) {
- handler.endElement(XHTML, "div", "div");
+ if (outputHtml) {
+ // Only an aborted parse can leave elements open; on a clean
parse
+ // the balancer stack is empty. Draining only on abort keeps
the
+ // package-entry div well-formed when the inner parse throws,
while
+ // letting StrictXHTMLValidator still catch genuine imbalances
on the
+ // happy path (TIKA-4728).
+ if (!parsedCleanly) {
+ balancer.drainOpenElements();
+ }
+ handler.endElement(XHTML, "div", "div");
+ }
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 337eba15ab..53d382b2e0 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -42,6 +42,7 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
private HANDLER_TYPE type = HANDLER_TYPE.MARKDOWN;
private int writeLimit = -1;
private boolean throwOnWriteLimitReached = true;
+ private boolean validateXHTML = false;
private transient ParseContext parseContext;
/**
@@ -83,6 +84,23 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
}
}
+ /**
+ * When true, every handler produced by this factory is wrapped in a
+ * {@link StrictXHTMLValidator} so that any malformed XHTML event sequence
+ * emitted by a parser throws a {@link org.xml.sax.SAXException} at the
+ * offending event. Defaults to false. Intended for catching parser bugs
+ * in development / corpus scans; off in production for performance and
+ * because not every caller wants validation failures to surface as
+ * parser exceptions.
+ */
+ public void setValidateXHTML(boolean validateXHTML) {
+ this.validateXHTML = validateXHTML;
+ }
+
+ public boolean isValidateXHTML() {
+ return validateXHTML;
+ }
+
/**
* Creates a new BasicContentHandlerFactory configured from OutputLimits
in the ParseContext.
* <p>
@@ -138,7 +156,10 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
@Override
public ContentHandler createHandler() {
+ return maybeValidate(createHandlerInner());
+ }
+ private ContentHandler createHandlerInner() {
if (type == HANDLER_TYPE.BODY) {
return new BodyContentHandler(
new WriteOutContentHandler(new ToTextContentHandler(),
writeLimit,
@@ -154,6 +175,10 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
parseContext);
}
+ private ContentHandler maybeValidate(ContentHandler h) {
+ return validateXHTML ? new StrictXHTMLValidator(h) : h;
+ }
+
private ContentHandler getFormatHandler() {
switch (type) {
case TEXT:
@@ -171,7 +196,10 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
@Override
public ContentHandler createHandler(OutputStream os, Charset charset) {
+ return maybeValidate(createHandlerInner(os, charset));
+ }
+ private ContentHandler createHandlerInner(OutputStream os, Charset
charset) {
if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
}
@@ -292,6 +320,7 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
BasicContentHandlerFactory that = (BasicContentHandlerFactory) o;
return writeLimit == that.writeLimit &&
throwOnWriteLimitReached == that.throwOnWriteLimitReached &&
+ validateXHTML == that.validateXHTML &&
type == that.type;
}
@@ -300,6 +329,7 @@ public class BasicContentHandlerFactory implements
StreamingContentHandlerFactor
int result = type != null ? type.hashCode() : 0;
result = 31 * result + writeLimit;
result = 31 * result + (throwOnWriteLimitReached ? 1 : 0);
+ result = 31 * result + (validateXHTML ? 1 : 0);
return result;
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java
b/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java
new file mode 100644
index 0000000000..12cf3b629f
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/StrictXHTMLValidator.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A SAX content handler decorator that enforces XHTML well-formedness on the
+ * incoming event stream. Any parser that emits an event sequence that would
+ * produce malformed XHTML triggers a {@link SAXException} synchronously — the
+ * stack trace points at the parser code that made the offending call, instead
+ * of surfacing later as a parse error on the serialized output.
+ * <p>
+ * Invariants enforced:
+ * <ul>
+ * <li>{@code startDocument} is called at most once.</li>
+ * <li>No SAX events arrive after {@code endDocument}.</li>
+ * <li>Every {@code endElement} matches the topmost open {@code startElement}
+ * (no cross-nesting like {@code
<a><b></a></b>}).</li>
+ * <li>The element stack is empty when {@code endDocument} fires (no unclosed
+ * elements left dangling by an exception path).</li>
+ * <li>Within a single {@code startElement}, no two attributes share the same
+ * (namespaceURI, localName) pair (the bug class that produces
+ * {@code <div class="x" class="y">}).</li>
+ * </ul>
+ * Use as a decorator wrapping the real handler. It passes every event through
+ * to the downstream handler after validation, so any normal text/XHTML capture
+ * still works.
+ */
+public class StrictXHTMLValidator extends ContentHandlerDecorator {
+
+ private final Deque<QName> openElements = new ArrayDeque<>();
+ private boolean documentStarted;
+ private boolean documentEnded;
+
+ public StrictXHTMLValidator(ContentHandler handler) {
+ super(handler);
+ }
+
+ @Override
+ public void startDocument() throws SAXException {
+ if (documentStarted) {
+ throw new SAXException("StrictXHTMLValidator: startDocument called
twice");
+ }
+ if (documentEnded) {
+ throw new SAXException(
+ "StrictXHTMLValidator: startDocument after endDocument");
+ }
+ documentStarted = true;
+ super.startDocument();
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ if (documentEnded) {
+ throw new SAXException("StrictXHTMLValidator: endDocument called
twice");
+ }
+ if (!openElements.isEmpty()) {
+ throw new SAXException(
+ "StrictXHTMLValidator: endDocument with " +
openElements.size()
+ + " unclosed element(s); topmost was <"
+ + openElements.peek().qOrLocal() + ">");
+ }
+ documentEnded = true;
+ super.endDocument();
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
Attributes attrs)
+ throws SAXException {
+ ensureNotEnded("startElement <" + display(qName, localName) + ">");
+ checkAttributesUnique(qName, localName, attrs);
+ openElements.push(new QName(uri, localName, qName));
+ super.startElement(uri, localName, qName, attrs);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws
SAXException {
+ ensureNotEnded("endElement </" + display(qName, localName) + ">");
+ if (openElements.isEmpty()) {
+ throw new SAXException(
+ "StrictXHTMLValidator: endElement </" + display(qName,
localName)
+ + "> with no matching startElement");
+ }
+ QName top = openElements.pop();
+ if (!top.matches(uri, localName, qName)) {
+ throw new SAXException(
+ "StrictXHTMLValidator: endElement </" + display(qName,
localName)
+ + "> does not match topmost open element <"
+ + top.qOrLocal() + ">");
+ }
+ super.endElement(uri, localName, qName);
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws
SAXException {
+ ensureNotEnded("characters");
+ super.characters(ch, start, length);
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws
SAXException {
+ ensureNotEnded("ignorableWhitespace");
+ super.ignorableWhitespace(ch, start, length);
+ }
+
+ @Override
+ public void processingInstruction(String target, String data) throws
SAXException {
+ ensureNotEnded("processingInstruction");
+ super.processingInstruction(target, data);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws
SAXException {
+ ensureNotEnded("startPrefixMapping");
+ super.startPrefixMapping(prefix, uri);
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ ensureNotEnded("endPrefixMapping");
+ super.endPrefixMapping(prefix);
+ }
+
+ @Override
+ public void skippedEntity(String name) throws SAXException {
+ ensureNotEnded("skippedEntity");
+ super.skippedEntity(name);
+ }
+
+ private void ensureNotEnded(String event) throws SAXException {
+ if (documentEnded) {
+ throw new SAXException(
+ "StrictXHTMLValidator: " + event + " arrived after
endDocument");
+ }
+ }
+
+ private void checkAttributesUnique(String elementQName, String
elementLocalName,
+ Attributes attrs) throws SAXException {
+ int n = attrs.getLength();
+ if (n < 2) {
+ return;
+ }
+ // (uri, localName) pairs must be unique per the XML namespaces spec.
+ // We also check raw qnames because Tika's serializers emit by qname
and
+ // duplicate qnames produce malformed XHTML even when localnames
differ.
+ Set<String> seenUriLocal = new HashSet<>(n);
+ Set<String> seenQNames = new HashSet<>(n);
+ for (int i = 0; i < n; i++) {
+ String uri = nullSafe(attrs.getURI(i));
+ String local = nullSafe(attrs.getLocalName(i));
+ String qn = nullSafe(attrs.getQName(i));
+ // U+0001 cannot appear in a valid XML uri/localName, so it joins
the
+ // two unambiguously without risk of a key collision.
+ String key = uri + "\u0001" + local;
+ if (!seenUriLocal.add(key)) {
+ throw new SAXException(
+ "StrictXHTMLValidator: duplicate attribute on <"
+ + display(elementQName, elementLocalName) +
">: "
+ + (uri.isEmpty() ? local : ("{" + uri + "}" +
local)));
+ }
+ if (!qn.isEmpty() && !seenQNames.add(qn)) {
+ throw new SAXException(
+ "StrictXHTMLValidator: duplicate attribute qname on <"
+ + display(elementQName, elementLocalName) +
">: " + qn);
+ }
+ }
+ }
+
+ private static String nullSafe(String s) {
+ return s == null ? "" : s;
+ }
+
+ private static String display(String qName, String localName) {
+ if (qName != null && !qName.isEmpty()) {
+ return qName;
+ }
+ return localName == null ? "" : localName;
+ }
+
+ private static final class QName {
+ final String uri;
+ final String localName;
+ final String qName;
+
+ QName(String uri, String localName, String qName) {
+ this.uri = nullSafe(uri);
+ this.localName = nullSafe(localName);
+ this.qName = nullSafe(qName);
+ }
+
+ boolean matches(String u, String l, String q) {
+ // SAX parsers can vary in which fields they populate. Accept a
+ // match on either (uri, localName) or qName, whichever is present.
+ String otherU = nullSafe(u);
+ String otherL = nullSafe(l);
+ String otherQ = nullSafe(q);
+ boolean uriLocalMatch = uri.equals(otherU) &&
localName.equals(otherL)
+ && !localName.isEmpty();
+ boolean qNameMatch = !qName.isEmpty() && qName.equals(otherQ);
+ return uriLocalMatch || qNameMatch;
+ }
+
+ String qOrLocal() {
+ return qName.isEmpty() ? localName : qName;
+ }
+ }
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java
new file mode 100644
index 0000000000..14a0a9e9bc
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLBalancingHandler.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX decorator that tracks open elements so a parser can recover well-formed
+ * XHTML when an exception interrupts the SAX stream mid-element.
+ * <p>
+ * The decorator is a thin passthrough on the happy path: it pushes and pops an
+ * internal stack on {@code startElement}/{@code endElement} and otherwise
forwards
+ * every event to the wrapped handler unchanged. It deliberately does NOT mask
+ * bad event sequences (mismatched or excess endElement, duplicate attributes,
+ * etc.) -- those remain visible to {@link StrictXHTMLValidator} so parser bugs
+ * still surface as test failures.
+ * <p>
+ * The unhappy path -- a per-part SAX parser throwing mid-element after
emitting
+ * one or more start tags -- is handled via {@link #drainOpenElements()}, which
+ * emits a matching {@code endElement} (with the original uri/localName/qName)
+ * for every element still on the stack, in reverse open order. The wrapped
+ * handler is left in a well-formed state with no dangling elements from the
+ * failed sub-parse.
+ * <p>
+ * Typical use wraps the handler that receives events from an inner SAX parser,
+ * inside the catch arm that swallows the inner parser's exception:
+ * <pre>{@code
+ * XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(contentHandler);
+ * try {
+ * XMLReaderUtils.parseSAX(stream, new EmbeddedContentHandler(balancer),
context);
+ * } catch (SAXException e) {
+ * balancer.drainOpenElements();
+ * // ... log and continue ...
+ * }
+ * }</pre>
+ * This handler does not touch {@code startDocument}/{@code endDocument}; the
+ * caller still owns the document lifecycle.
+ */
+public class XHTMLBalancingHandler extends ContentHandlerDecorator {
+
+ private final Deque<QName> openElements = new ArrayDeque<>();
+
+ public XHTMLBalancingHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
Attributes attrs)
+ throws SAXException {
+ openElements.push(new QName(uri, localName, qName));
+ super.startElement(uri, localName, qName, attrs);
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws
SAXException {
+ // Pop best-effort: an unbalanced endElement (e.g., emitted after the
+ // matching startElement was swallowed) still forwards downstream so a
+ // wrapping StrictXHTMLValidator sees the violation.
+ if (!openElements.isEmpty()) {
+ openElements.pop();
+ }
+ super.endElement(uri, localName, qName);
+ }
+
+ /**
+ * Emits a matching {@code endElement} for every element still on the open
+ * stack, in reverse open order. After this call the stack is empty.
+ * <p>
+ * Intended for the catch arm of a caller that swallowed a
+ * {@link SAXException} from an inner SAX parser: the inner parser may have
+ * left one or more elements open mid-stream, and downstream serialization
+ * needs matching closers before any further events.
+ * <p>
+ * Does NOT emit {@code endDocument} -- document lifecycle stays with the
+ * caller.
+ */
+ public void drainOpenElements() throws SAXException {
+ while (!openElements.isEmpty()) {
+ QName q = openElements.pop();
+ super.endElement(q.uri, q.localName, q.qName);
+ }
+ }
+
+ /**
+ * Number of elements currently open through this handler. Exposed for
+ * tests and for callers that want to know whether
+ * {@link #drainOpenElements()} would emit anything.
+ */
+ public int openElementCount() {
+ return openElements.size();
+ }
+
+ private static final class QName {
+ final String uri;
+ final String localName;
+ final String qName;
+
+ QName(String uri, String localName, String qName) {
+ this.uri = uri == null ? "" : uri;
+ this.localName = localName == null ? "" : localName;
+ this.qName = qName == null ? "" : qName;
+ }
+ }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a8e37a85b2..91f3c236c5 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -58,6 +58,7 @@ import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.StrictXHTMLValidator;
import org.apache.tika.sax.ToXMLContentHandler;
/**
@@ -66,6 +67,19 @@ import org.apache.tika.sax.ToXMLContentHandler;
public abstract class TikaTest {
protected static Parser AUTO_DETECT_PARSER = new AutoDetectParser();
+
+ /**
+ * Build a {@link BasicContentHandlerFactory} with the strict XHTML
+ * validator enabled so that any malformed XHTML emitted during a
+ * recursive parse throws at the offending SAX event. All TikaTest
+ * helpers route through this so individual tests don't have to opt in.
+ */
+ private static BasicContentHandlerFactory validatingFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE type) {
+ BasicContentHandlerFactory f = new BasicContentHandlerFactory(type,
-1);
+ f.setValidateXHTML(true);
+ return f;
+ }
public static void assertContainsCount(String needle, String haystack, int
targetCount) {
int i = haystack.indexOf(needle);
int count = 0;
@@ -330,9 +344,13 @@ public abstract class TikaTest {
}
try (input) {
- ContentHandler handler = new ToXMLContentHandler();
+ ToXMLContentHandler xml = new ToXMLContentHandler();
+ // Wrap with the strict validator so parsers that emit malformed
+ // XHTML (duplicate attrs, unclosed tags, cross-nested elements)
+ // throw at the offending SAX event with a parser stack frame.
+ ContentHandler handler = new StrictXHTMLValidator(xml);
parser.parse(input, handler, metadata, context);
- return new XMLResult(handler.toString(), metadata);
+ return new XMLResult(xml.toString(), metadata);
}
}
@@ -462,7 +480,7 @@ public abstract class TikaTest {
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+
validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML));
try (tis) {
wrapper.parse(tis, handler, metadata, context);
} catch (Exception e) {
@@ -479,7 +497,7 @@ public abstract class TikaTest {
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(handlerType, -1));
+ validatingFactory(handlerType));
try (tis) {
wrapper.parse(tis, handler, metadata, context);
} catch (Exception e) {
@@ -495,7 +513,7 @@ public abstract class TikaTest {
RecursiveParserWrapper wrapper = new
RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+
validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML));
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
FilenameUtils.getName(filePath));
try (TikaInputStream tis = getResourceAsStream("/test-documents/" +
filePath)) {
@@ -523,7 +541,7 @@ public abstract class TikaTest {
ParseContext context) throws
Exception {
RecursiveParserWrapper wrapper = new
RecursiveParserWrapper(parserToWrap);
RecursiveParserWrapperHandler handler =
- new RecursiveParserWrapperHandler(new
BasicContentHandlerFactory(handlerType, -1));
+ new
RecursiveParserWrapperHandler(validatingFactory(handlerType));
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
FilenameUtils.getName(filePath));
try (TikaInputStream tis = getResourceAsStream("/test-documents/" +
filePath)) {
@@ -536,7 +554,7 @@ public abstract class TikaTest {
ParseContext parseContext)
throws Exception {
RecursiveParserWrapper wrapper = new
RecursiveParserWrapper(parserToWrap);
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+
validatingFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML));
try (TikaInputStream tis = getResourceAsStream("/test-documents/" +
filePath)) {
wrapper.parse(tis, handler, new Metadata(), parseContext);
@@ -578,7 +596,8 @@ public abstract class TikaTest {
*/
public String getText(TikaInputStream tis, Parser parser, ParseContext
context, Metadata metadata)
throws Exception {
- ContentHandler handler = new BodyContentHandler(1000000);
+ BodyContentHandler body = new BodyContentHandler(1000000);
+ ContentHandler handler = new StrictXHTMLValidator(body);
try (tis) {
parser.parse(tis, handler, metadata, context);
} catch (SAXException e) {
@@ -586,7 +605,7 @@ public abstract class TikaTest {
throw e;
}
}
- return handler.toString();
+ return body.toString();
}
public String getText(TikaInputStream tis, Parser parser, Metadata
metadata) throws Exception {
diff --git
a/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java
b/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java
new file mode 100644
index 0000000000..3688616c42
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLBalancingHandlerTest.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XHTMLBalancingHandlerTest {
+
+ private static AttributesImpl noAtts() {
+ return new AttributesImpl();
+ }
+
+ @Test
+ public void happyPathIsPassthrough() throws Exception {
+ ToXMLContentHandler out = new ToXMLContentHandler();
+ XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out);
+
+ balancer.startDocument();
+ balancer.startElement("", "p", "p", noAtts());
+ balancer.characters("hello".toCharArray(), 0, 5);
+ balancer.endElement("", "p", "p");
+ balancer.endDocument();
+
+ assertEquals(0, balancer.openElementCount());
+ // No drain needed -- stack should already be empty.
+ balancer.drainOpenElements();
+ // ToXMLContentHandler.toString() returns the serialized form.
+ String xml = out.toString();
+ assertEquals(true, xml.contains("<p>hello</p>"), xml);
+ }
+
+ @Test
+ public void drainClosesElementsInReverseOpenOrder() throws Exception {
+ ToXMLContentHandler out = new ToXMLContentHandler();
+ XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out);
+
+ balancer.startDocument();
+ balancer.startElement("", "div", "div", noAtts());
+ balancer.startElement("", "p", "p", noAtts());
+ balancer.startElement("", "span", "span", noAtts());
+ balancer.characters("oops".toCharArray(), 0, 4);
+
+ // Simulate exception mid-element: caller drains.
+ assertEquals(3, balancer.openElementCount());
+ balancer.drainOpenElements();
+ assertEquals(0, balancer.openElementCount());
+
+ balancer.endDocument();
+
+ String xml = out.toString();
+ // Expect </span></p></div> in that order.
+ int spanIdx = xml.indexOf("</span>");
+ int pIdx = xml.indexOf("</p>");
+ int divIdx = xml.indexOf("</div>");
+ assertEquals(true, spanIdx >= 0 && pIdx > spanIdx && divIdx > pIdx,
+ "expected </span></p></div> order, got: " + xml);
+ }
+
+ @Test
+ public void drainEmitsMatchingUriAndQName() throws Exception {
+ // Verifies the Copilot review point: endElement must carry the same
+ // (uri, localName, qName) tuple as the matching startElement.
+ ToXMLContentHandler out = new ToXMLContentHandler();
+ XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out);
+
+ balancer.startDocument();
+ balancer.startPrefixMapping("h", "http://example.com/ns");
+ balancer.startElement("http://example.com/ns", "wrap", "h:wrap",
noAtts());
+ // Emit content so the serializer can't collapse to a self-closing tag,
+ // forcing the close form to be explicit -- proves drainOpen used the
+ // matching qName ("h:wrap") rather than just the local name.
+ balancer.characters("x".toCharArray(), 0, 1);
+ balancer.drainOpenElements();
+ balancer.endPrefixMapping("h");
+ balancer.endDocument();
+
+ String xml = out.toString();
+ assertEquals(true, xml.contains("</h:wrap>"),
+ "expected qualified close </h:wrap>, got: " + xml);
+ }
+
+ @Test
+ public void drainIsIdempotent() throws Exception {
+ ToXMLContentHandler out = new ToXMLContentHandler();
+ XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(out);
+
+ balancer.startDocument();
+ balancer.startElement("", "p", "p", noAtts());
+ balancer.drainOpenElements();
+ balancer.drainOpenElements(); // second call: no-op
+ balancer.endDocument();
+ assertEquals(0, balancer.openElementCount());
+ }
+
+ @Test
+ public void downstreamValidatorStillCatchesMismatchedEndElement() throws
Exception {
+ // Balancer must NOT silently fix bad happy-path sequences -- the
+ // StrictXHTMLValidator wrapping the real handler must still see (and
+ // reject) excess endElement events.
+ ToXMLContentHandler out = new ToXMLContentHandler();
+ StrictXHTMLValidator validator = new StrictXHTMLValidator(out);
+ XHTMLBalancingHandler balancer = new XHTMLBalancingHandler(validator);
+
+ balancer.startDocument();
+ balancer.startElement("", "p", "p", noAtts());
+ balancer.endElement("", "p", "p");
+ // Extra endElement: stack is empty so balancer pops nothing, but the
+ // event still flows downstream to the validator, which must throw.
+ assertThrows(org.xml.sax.SAXException.class,
+ () -> balancer.endElement("", "p", "p"));
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
index 1329b8db06..2298983d1f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
@@ -59,7 +59,9 @@ class PagesContentHandler extends DefaultHandler {
@Override
public void endDocument() throws SAXException {
metadata.set(Office.PAGE_COUNT, String.valueOf(pageCount));
- if (pageCount > 0) {
+ // Either sf:page-start or sl:page-group opens a <div>; close the
+ // last open one regardless of which counter tracked it.
+ if (pageCount + slPageCount > 0) {
doFooter();
xhtml.endElement("div");
}
@@ -85,7 +87,17 @@ class PagesContentHandler extends DefaultHandler {
} else if ("sf:metadata".equals(qName)) {
inPart = DocumentPart.METADATA;
} else if ("sf:page-start".equals(qName) ||
"sl:page-group".equals(qName)) {
- if (pageCount > 0) {
+ // sf:p paragraphs can span page boundaries in iWork's XML schema.
+ // If a <p> is still open when the page changes, force-close it
+ // before the page <div> and reopen it in the new page so both
+ // pages have balanced tag pairs.
+ boolean reopenP = inPart == DocumentPart.PARSABLE_TEXT;
+ if (reopenP) {
+ xhtml.endElement("p");
+ }
+ // Use the combined counter so we close the prior <div> regardless
+ // of whether it was opened by sf:page-start or sl:page-group.
+ if (pageCount + slPageCount > 0) {
doFooter();
xhtml.endElement("div");
}
@@ -96,6 +108,9 @@ class PagesContentHandler extends DefaultHandler {
pageCount++;
}
doHeader();
+ if (reopenP) {
+ xhtml.startElement("p");
+ }
} else if ("sf:p".equals(qName)) {
if (pageCount + slPageCount > 0) {
inPart = DocumentPart.PARSABLE_TEXT;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 676575ce40..5aeb992188 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -77,53 +77,60 @@ public class PRTParser implements Parser {
ParseContext context) throws IOException, SAXException,
TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata,
context);
- Last5 l5 = new Last5();
- int read;
+ xhtml.startDocument();
+ // try/finally so endDocument fires even if a header read, characters
+ // emit, or recursive helper throws
IOException/SAXException/TikaException
+ // mid-parse. Without this the captured XHTML would be left
unterminated.
+ try {
+ Last5 l5 = new Last5();
+ int read;
- // Try to get the creation date, which is YYYYMMDDhhmm
- byte[] header = new byte[30];
- IOUtils.readFully(tis, header);
- byte[] date = new byte[12];
- IOUtils.readFully(tis, date);
+ // Try to get the creation date, which is YYYYMMDDhhmm
+ byte[] header = new byte[30];
+ IOUtils.readFully(tis, header);
+ byte[] date = new byte[12];
+ IOUtils.readFully(tis, date);
- String dateStr = new String(date, US_ASCII);
- if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
- String formattedDate = dateStr.substring(0, 4) + "-" +
dateStr.substring(4, 6) + "-" +
- dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) +
":" +
- dateStr.substring(10, 12) + ":00";
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- // TODO Metadata.DATE is used as modified, should it be here?
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- }
- metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+ String dateStr = new String(date, US_ASCII);
+ if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+ String formattedDate = dateStr.substring(0, 4) + "-" +
dateStr.substring(4, 6) + "-" +
+ dateStr.substring(6, 8) + "T" + dateStr.substring(8,
10) + ":" +
+ dateStr.substring(10, 12) + ":00";
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ metadata.set(TikaCoreProperties.MODIFIED, formattedDate);
+ }
+ metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
- // The description, if set, is the next up-to-500 bytes
- byte[] desc = new byte[500];
- IOUtils.readFully(tis, desc);
- String description = extractText(desc, true);
- if (description.length() > 0) {
- metadata.set(TikaCoreProperties.DESCRIPTION, description);
- }
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(tis, desc);
+ String description = extractText(desc, true);
+ if (description.length() > 0) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ }
- // Now look for text
- while ((read = tis.read()) > -1) {
- if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
- int nread = tis.read();
- if (nread == 0x3f || nread == 0xbf) {
- // Looks promising, check back for a suitable value
- if (read == 0xe3 && nread == 0x3f) {
- if (l5.is33()) {
- // Bingo, note text
- handleNoteText(tis, xhtml);
+ // Now look for text
+ while ((read = tis.read()) > -1) {
+ if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+ int nread = tis.read();
+ if (nread == 0x3f || nread == 0xbf) {
+ // Looks promising, check back for a suitable value
+ if (read == 0xe3 && nread == 0x3f) {
+ if (l5.is33()) {
+ // Bingo, note text
+ handleNoteText(tis, xhtml);
+ }
+ } else if (l5.is00()) {
+ // Likely view name
+ handleViewName(read, nread, tis, xhtml, l5);
}
- } else if (l5.is00()) {
- // Likely view name
- handleViewName(read, nread, tis, xhtml, l5);
}
+ } else {
+ l5.record(read);
}
- } else {
- l5.record(read);
}
+ } finally {
+ xhtml.endDocument();
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index e5ef13da29..27c0b92efb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -165,6 +165,21 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
}
private static class TikaNodeFilter implements NodeFilter {
+
+ // JHighlight wraps its highlighted code in a full HTML document. Of
+ // those wrappers:
+ // - html/body are dropped as elements but their descendants are
+ // still emitted (that's where the highlighted code lives).
+ // - head/meta/title/link/style are dropped *with* their subtrees,
+ // otherwise their text (CSS, the filename, etc.) leaks into the
+ // outer XHTML as bare character data.
+ // All of these collide with Tika's outer XHTMLContentHandler when
+ // emitted, producing malformed XHTML downstream.
+ private static final Set<String> WRAPPER_TAGS_DROP_ELEMENT =
+ Set.of("html", "body");
+ private static final Set<String> WRAPPER_TAGS_DROP_SUBTREE =
+ Set.of("head", "meta", "title", "link", "style");
+
boolean ignore = true;
ContentHandler handler;
@@ -181,6 +196,12 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
if (ignore) {
return FilterResult.CONTINUE;
}
+ if (WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) {
+ return FilterResult.SKIP_ENTIRELY;
+ }
+ if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName())) {
+ return FilterResult.CONTINUE;
+ }
if (node instanceof TextNode) {
String txt = ((TextNode) node).getWholeText();
if (txt != null) {
@@ -234,6 +255,10 @@ public class SourceCodeParser extends
AbstractEncodingDetectorParser {
if (ignore) {
return FilterResult.CONTINUE;
}
+ if (WRAPPER_TAGS_DROP_ELEMENT.contains(node.nodeName())
+ || WRAPPER_TAGS_DROP_SUBTREE.contains(node.nodeName())) {
+ return FilterResult.CONTINUE;
+ }
if (node instanceof TextNode || node instanceof DataNode) {
return NodeFilter.FilterResult.CONTINUE;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 36948b71f8..03023cc0ed 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -56,7 +56,6 @@ public class SourceCodeParserTest extends TikaTest {
createMetadata("text/x-java-source")).xml;
assertTrue(htmlContent.indexOf("<html xmlns=\"http") == 0);
- assertContains("xml:lang=\"en\"", htmlContent);
assertTrue(htmlContent.indexOf(
"<span class=\"java_keyword\">public</span><span
class=\"java_plain\">") >
0);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 900853f2e0..5ec9bfe954 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -63,6 +63,7 @@ import
org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLBalancingHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
@@ -701,11 +702,23 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
if (relatedPartPart == null) {
continue;
}
+ // Wrap the contentHandler so we can close anything the
+ // inner parser left open if it throws mid-element. Without
+ // this, the </div> emitted after the loop would land on
+ // top of an open <p>/<td>/etc. from the failed sub-parse.
+ XHTMLBalancingHandler balancer =
+ new XHTMLBalancingHandler(contentHandler);
try (InputStream stream =
relatedPartPart.getInputStream()) {
XMLReaderUtils.parseSAX(stream,
- new EmbeddedContentHandler(contentHandler),
context);
+ new EmbeddedContentHandler(balancer), context);
} catch (IOException | TikaException e) {
+ balancer.drainOpenElements();
+
parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ } catch (SAXException e) {
+ balancer.drainOpenElements();
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index a18f52a4d2..1e11e80b78 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -64,6 +64,14 @@ public class OOXMLTikaBodyPartHandler
//<tc><p/><p/><table><tr><tc></p></p></tc></tr></table>...
private int tableCellDepth = 0;
private int pWithinCell = 0;
+ // Stack of structural elements (paragraphs, tables, rows, cells) this
+ // handler has emitted to the xhtml stream and not yet closed. Used by
+ // closeAnyPending() to drain the stack in reverse order so the captured
+ // XHTML stays balanced when a caller's parseSAX call throws part-way.
+ // Tags emitted by FormattingTagManager (<b>/<i>/<u>/<s>/<a>) are not
+ // tracked here -- closeAnyPending closes them via
formattingTags.closeAll()
+ // before draining this stack.
+ private final java.util.Deque<String> openStructuralTags = new
java.util.ArrayDeque<>();
//will need to replace this with a stack
//if we're marking more that the first level <p/> element
@@ -163,6 +171,7 @@ public class OOXMLTikaBodyPartHandler
} else {
xhtml.startElement(paragraphTag, "class", styleClass);
}
+ openStructuralTags.push(paragraphTag);
}
writeParagraphNumber(paragraphProperties.getNumId(),
paragraphProperties.getIlvl(),
@@ -176,6 +185,7 @@ public class OOXMLTikaBodyPartHandler
formattingTags.closeAll();
if (pDepth == 1 && tableDepth == 0) {
xhtml.endElement(paragraphTag);
+ popExpected(paragraphTag);
} else if (tableCellDepth > 0 && pWithinCell > 0) {
xhtml.characters(NEWLINE, 0, 1);
} else if (tableCellDepth == 0) {
@@ -214,10 +224,50 @@ public class OOXMLTikaBodyPartHandler
return emittedCommentIds;
}
+ /**
+ * Closes any XHTML elements this handler opened but didn't get a chance to
+ * close, in the proper nesting order. Intended ONLY for the catch arm of a
+ * caller that swallowed a {@link SAXException} from the inner SAX parser;
+ * the normal happy-path flow keeps the trackers in sync via endParagraph
+ * / endTableCell / endTableRow / endTable / FormattingTagManager.closeAll.
+ * Without this, swallowed exceptions leave dangling {@code <p>}, {@code
<td>},
+ * {@code <tr>}, {@code <table>}, or formatting tags on the wire that
+ * collide with the outer {@code </body></html>}.
+ */
+ public void closeAnyPending() throws SAXException {
+ formattingTags.closeAll();
+ // Drain the structural-element stack in reverse open order. This
+ // handles nested tables correctly (multiple cells/rows/tables
+ // interleaved), unlike per-element counters which lose nesting info.
+ while (!openStructuralTags.isEmpty()) {
+ String tag = openStructuralTags.pop();
+ xhtml.endElement(tag);
+ }
+ // Reset internal depth/state so subsequent emits start clean.
+ tableDepth = 0;
+ tableCellDepth = 0;
+ pDepth = 0;
+ pWithinCell = 0;
+ }
+
+ /**
+ * Pops {@code openStructuralTags} expecting the given tag on top.
+ * If the stack is empty or the top differs, this is a no-op rather than a
+ * throw -- the stack is best-effort tracking for closeAnyPending(), and
+ * the existing happy-path tests (which don't trigger closeAnyPending) must
+ * not be perturbed by stack tracking bugs.
+ */
+ private void popExpected(String tag) {
+ if (!openStructuralTags.isEmpty() &&
tag.equals(openStructuralTags.peek())) {
+ openStructuralTags.pop();
+ }
+ }
+
@Override
public void startTable() throws SAXException {
xhtml.startElement("table");
+ openStructuralTags.push("table");
tableDepth++;
}
@@ -226,6 +276,7 @@ public class OOXMLTikaBodyPartHandler
public void endTable() throws SAXException {
xhtml.endElement("table");
+ popExpected("table");
tableDepth--;
}
@@ -233,22 +284,26 @@ public class OOXMLTikaBodyPartHandler
@Override
public void startTableRow() throws SAXException {
xhtml.startElement("tr");
+ openStructuralTags.push("tr");
}
@Override
public void endTableRow() throws SAXException {
xhtml.endElement("tr");
+ popExpected("tr");
}
@Override
public void startTableCell() throws SAXException {
xhtml.startElement("td");
+ openStructuralTags.push("td");
tableCellDepth++;
}
@Override
public void endTableCell() throws SAXException {
xhtml.endElement("td");
+ popExpected("td");
pWithinCell = 0;
tableCellDepth--;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index a93bfd7d08..0fb05407a5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -271,6 +271,13 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
runBuffer.append(TAB_CHAR);
} else if (P.equals(localName)) {
lastStartElementWasP = true;
+ // Each <w:p> needs its own pStarted lifecycle. Without this,
+ // a nested <w:p> (e.g., inside <wps:txbx>/<w:txbxContent>) would
+ // inherit the outer paragraph's pStarted=true, suppress its own
+ // startParagraph in the </w:pPr> branch, then fire its
+ // endParagraph on </w:p> -- producing an unbalanced start/end
+ // count that desyncs the XHTML <p>/<p> stream.
+ pStarted = false;
} else if (B.equals(localName)) { //TODO: add bCs
if (inR && inRPr) {
currRunProperties.setBold(true);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index afa01a2339..9ffda5fd03 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -38,6 +38,7 @@ import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PageAnchoring;
@@ -202,9 +203,10 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
int hidden = 0;
xhtml.startElement("div", "class", "slide-content");
+ OOXMLTikaBodyPartHandler bodyHandler = new
OOXMLTikaBodyPartHandler(xhtml, metadata);
try (InputStream stream = slidePart.getInputStream()) {
- OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new
OOXMLWordAndPowerPointTextHandler(
- new OOXMLTikaBodyPartHandler(xhtml, metadata),
linkedRelationships);
+ OOXMLWordAndPowerPointTextHandler wordAndPPTHandler =
+ new OOXMLWordAndPowerPointTextHandler(bodyHandler,
linkedRelationships);
XMLReaderUtils.parseSAX(stream,
new EmbeddedContentHandler(wordAndPPTHandler), context);
if (wordAndPPTHandler.isHiddenSlide()) {
@@ -214,9 +216,19 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
if (wordAndPPTHandler.hasAnimations()) {
metadata.set(Office.HAS_ANIMATIONS, true);
}
+ } catch (SAXException e) {
+ // Truncated/malformed slide XML can leave the body handler with
+ // unclosed <p>, <td>, etc. on the wire. Close them before the
+ // </div> below so subsequent slides -- and the outer </body> --
+ // land in a balanced spot.
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ bodyHandler.closeAnyPending();
} catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
+ bodyHandler.closeAnyPending();
}
xhtml.endElement("div");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index de3ff51835..9da2352c19 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -381,9 +381,15 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
WriteLimitReachedException.throwIfWriteLimitReached(e);
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
+ // The partial parse may have left <p>, <td>, <tr>, <table>, or
+ // formatting tags open on the XHTML stream. Close them now so
+ // subsequent parts -- and the outer </body></html> -- land in a
+ // balanced spot.
+ bodyHandler.closeAnyPending();
} catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
+ bodyHandler.closeAnyPending();
}
Map<String, EmbeddedPartMetadata> partMetadata =
bodyHandler.getEmbeddedPartMetadataMap();
resolveEmfNames(packagePart, partMetadata);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index feb18e0da3..dae608b7ac 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -235,6 +235,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
WriteLimitReachedException.throwIfWriteLimitReached(e);
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
+ // Balance any <tr>/<td> left open by the partial parse so
+ // the </tbody></table></div> emitted below land in the
+ // right place.
+ sheetExtractor.closeAnyPending();
}
try {
getThreadedComments(container, sheetPart, xhtml);
@@ -991,6 +995,12 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
private XHTMLContentHandler xhtml;
private int lastSeenRow = -1;
private int lastSeenCol = -1;
+ // Track open <tr>/<td> so the outer catch can emit balanced closes
+ // when processSheet throws part-way through a row (e.g., a malformed
+ // sheet XML). Without this, the outer code would emit </tbody></table>
+ // while <tr> (or <td>) was still on the stack, producing malformed
XHTML.
+ private boolean rowOpen;
+ private boolean cellOpen;
protected SheetTextAsHTML(OfficeParserConfig config,
XHTMLContentHandler xhtml) {
this.includeHeadersFooters = config.isIncludeHeadersAndFooters();
@@ -1006,14 +1016,19 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
if (includeMissingRows && rowNum > (lastSeenRow + 1)) {
for (int rn = lastSeenRow + 1; rn < rowNum; rn++) {
xhtml.startElement("tr");
+ rowOpen = true;
xhtml.startElement("td");
+ cellOpen = true;
xhtml.endElement("td");
+ cellOpen = false;
xhtml.endElement("tr");
+ rowOpen = false;
}
}
// Start the new row
xhtml.startElement("tr");
+ rowOpen = true;
lastSeenCol = -1;
} catch (SAXException e) {
//swallow
@@ -1025,11 +1040,28 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
public void endRow(int rowNum) {
try {
xhtml.endElement("tr");
+ rowOpen = false;
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
}
+ /**
+ * Closes any pending {@code <tr>} or {@code <td>} that was opened
+ * before a {@link SAXException} interrupted sheet processing. Safe to
+ * call when nothing is open.
+ */
+ void closeAnyPending() throws SAXException {
+ if (cellOpen) {
+ xhtml.endElement("td");
+ cellOpen = false;
+ }
+ if (rowOpen) {
+ xhtml.endElement("tr");
+ rowOpen = false;
+ }
+ }
+
public void cell(String cellRef, String formattedValue,
XSSFCommentsShim.CommentData comment) {
try {
@@ -1038,12 +1070,15 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
(cellRef == null) ? lastSeenCol + 1 : (new
CellReference(cellRef)).getCol();
for (int cn = lastSeenCol + 1; cn < colNum; cn++) {
xhtml.startElement("td");
+ cellOpen = true;
xhtml.endElement("td");
+ cellOpen = false;
}
lastSeenCol = colNum;
// Start this cell
xhtml.startElement("td");
+ cellOpen = true;
// Main cell contents
if (formattedValue != null) {
@@ -1060,6 +1095,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
xhtml.endElement("td");
+ cellOpen = false;
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 4460946c52..9d2e2239be 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -59,8 +59,8 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.xml.DcXMLParser;
import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLBalancingHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
@@ -114,11 +114,12 @@ public class EpubParser implements Parser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata,
context);
xhtml.startDocument();
IOException caughtException = null;
- ContentHandler childHandler = new EmbeddedContentHandler(
- new EpubNormalizingHandler(new BodyContentHandler(xhtml)));
+ EpubNormalizingHandler normalizer =
+ new EpubNormalizingHandler(new BodyContentHandler(xhtml));
+ ContentHandler childHandler = new EmbeddedContentHandler(normalizer);
Set<String> encryptedItems = Collections.EMPTY_SET;
try {
- encryptedItems = bufferedParse(tis, childHandler, xhtml, metadata,
context);
+ encryptedItems = bufferedParse(tis, childHandler, normalizer,
xhtml, metadata, context);
} catch (IOException e) {
caughtException = e;
}
@@ -141,19 +142,22 @@ public class EpubParser implements Parser {
}
private Set<String> bufferedParse(TikaInputStream tis, ContentHandler
bodyHandler,
+ EpubNormalizingHandler normalizer,
XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context)
throws IOException, TikaException, SAXException {
// DefaultZipContainerDetector opens (and salvages, if needed) the
ZipFile and
// stashes it on the TikaInputStream. Reuse it when present; otherwise
open ourselves.
if (tis.getOpenContainer() instanceof ZipFile) {
- return bufferedParseZipFile((ZipFile) tis.getOpenContainer(),
bodyHandler, xhtml, metadata, context);
+ return bufferedParseZipFile((ZipFile) tis.getOpenContainer(),
bodyHandler,
+ normalizer, xhtml, metadata, context);
}
try (ZipFile zipFile =
ZipFile.builder().setFile(tis.getPath().toFile()).get()) {
- return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata,
context);
+ return bufferedParseZipFile(zipFile, bodyHandler, normalizer,
xhtml, metadata, context);
}
}
private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler
bodyHandler,
+ EpubNormalizingHandler normalizer,
XHTMLContentHandler xhtml, Metadata
metadata,
ParseContext context)
throws IOException, TikaException, SAXException {
@@ -167,7 +171,7 @@ public class EpubParser implements Parser {
// emit partial content (matching 3.x's streamingParse contract),
// then throw to signal the result is incomplete.
LOG.trace("epub fallback: rootOPF=null, streaming all html
entries");
- return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata,
context,
+ return fallbackParseAllHtmlEntries(zipFile, bodyHandler,
normalizer, metadata, context,
"no OPF found in (possibly truncated) container");
}
ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
@@ -175,7 +179,7 @@ public class EpubParser implements Parser {
zae, zae == null ? "n/a" : zipFile.canReadEntryData(zae));
if (zae == null || !zipFile.canReadEntryData(zae)) {
LOG.trace("epub fallback: OPF entry missing/unreadable, streaming
all html entries");
- return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata,
context,
+ return fallbackParseAllHtmlEntries(zipFile, bodyHandler,
normalizer, metadata, context,
"OPF entry missing or unreadable in (possibly truncated)
container");
}
try (TikaInputStream tis =
TikaInputStream.get(zipFile.getInputStream(zae))) {
@@ -191,7 +195,7 @@ public class EpubParser implements Parser {
contentOrderScraper.locationMap.size());
if (contentOrderScraper.contentItems.isEmpty()) {
LOG.trace("epub fallback: empty spine, streaming all html
entries");
- return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata,
context,
+ return fallbackParseAllHtmlEntries(zipFile, bodyHandler,
normalizer, metadata, context,
"OPF declared no spine items in (possibly truncated)
container");
}
String relativePath = "";
@@ -236,6 +240,12 @@ public class EpubParser implements Parser {
throw e;
}
saxExceptions.add(e);
+ // The aborted spine item may have left <svg>,
+ // <g>, <p>, etc. open on the wire. Close them
+ // before the next item (or the outer </body>)
+ // emits, otherwise the validator sees cross-
+ // nested events.
+ normalizer.drainOpenElements();
} catch (IOException ioe) {
LOG.trace("epub spine read IOException on {}: {}",
path, ioe.toString());
throw ioe;
@@ -297,6 +307,7 @@ public class EpubParser implements Parser {
*/
private Set<String> fallbackParseAllHtmlEntries(ZipFile zipFile,
ContentHandler bodyHandler,
+ EpubNormalizingHandler
normalizer,
Metadata metadata,
ParseContext context,
String reason)
@@ -335,6 +346,8 @@ public class EpubParser implements Parser {
}
failed++;
LOG.trace("epub fallback: SAX failure on {}: {}",
entry.getName(), e.toString());
+ // Close any tags the aborted parse left open.
+ normalizer.drainOpenElements();
} catch (IOException e) {
failed++;
LOG.trace("epub fallback: IO failure on {}: {}",
entry.getName(), e.toString());
@@ -588,7 +601,8 @@ public class EpubParser implements Parser {
//for now, this simply converts all names to local names to avoid
//namespace conflicts in the content handler. This also removes namespaces
//from attributes
- private static class EpubNormalizingHandler extends
ContentHandlerDecorator {
+ private static class EpubNormalizingHandler extends XHTMLBalancingHandler {
+
public EpubNormalizingHandler(ContentHandler contentHandler) {
super(contentHandler);
}
@@ -607,7 +621,15 @@ public class EpubParser implements Parser {
if (needToRewrite) {
AttributesImpl simplifiedAtts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
- simplifiedAtts.addAttribute("", atts.getLocalName(i),
atts.getLocalName(i),
+ String localAttName = atts.getLocalName(i);
+ // Stripping the namespace prefix can collapse two distinct
+ // qnames onto one local name (e.g. xml:lang + lang). The
+ // serialized XHTML must have unique attribute names, so
+ // keep the first occurrence and drop later duplicates.
+ if (simplifiedAtts.getIndex("", localAttName) >= 0) {
+ continue;
+ }
+ simplifiedAtts.addAttribute("", localAttName, localAttName,
atts.getType(i), atts.getValue(i));
}
super.startElement(uri, localName, localName, simplifiedAtts);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index b001f09f85..6bbc7eaa86 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -150,6 +150,12 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
private boolean curBold;
private boolean curItalic;
private int pDepth = 0;
+ // Nesting depth of <a> (text:a / draw:a) elements. While > 0, style tag
+ // emission is suppressed so b/i/u never close across an open <a> (which
+ // would produce cross-nested XHTML like <a><b>...</a></b>). The tradeoff
+ // is that bold/italic/underline runs that begin or end inside a link lose
+ // their formatting in the captured XHTML.
+ private int anchorDepth = 0;
OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext)
{
super(handler, MAPPINGS);
this.handler = handler;
@@ -275,6 +281,11 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
private void updateStyleTags() throws SAXException {
+ // Suppress style-tag flips while inside a <a> -- emitting
</b>/</i>/</u>
+ // here would cross-nest with the open anchor and produce malformed
XHTML.
+ if (anchorDepth > 0) {
+ return;
+ }
if (currTextStyle == null) {
closeStyleTags();
return;
@@ -325,7 +336,39 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
updateStyleTags();
}
+ /**
+ * Returns true for ODF elements that map to block-level XHTML and so
+ * shouldn't sit inside open inline-style tags. When such an element opens
+ * while {@code <b>/<i>/<u>} are on the SAX stack, the inline tags would
+ * trap the new block element underneath them; subsequent style flips
+ * inside the block would emit close events that don't match the topmost
+ * open element. The startElement handler closes pending style tags
+ * before opening any of these.
+ * <p>
+ * text:p / text:h / text:list / annotation / note / notes / a are handled
+ * by their own branches in startElement and never reach the default
+ * branch where this check is used.
+ */
+ private static boolean isBlockLevelOpen(String uri, String localName) {
+ if (DRAW_NS.equals(uri) && "text-box".equals(localName)) {
+ return true;
+ }
+ if (TABLE_NS.equals(uri) &&
+ ("table".equals(localName) || "table-row".equals(localName)
+ || "table-cell".equals(localName))) {
+ return true;
+ }
+ return TEXT_NS.equals(uri) && "list-item".equals(localName);
+ }
+
private void closeStyleTags() throws SAXException {
+ // Same reasoning as in updateStyleTags: never emit style-tag closes
+ // while a <a> is on the stack -- the </b>/</i>/</u> would land in the
+ // wrong place and the strict validator (or any SAX parser) would
reject
+ // the resulting XHTML.
+ if (anchorDepth > 0) {
+ return;
+ }
// Close any still open style tags
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
@@ -437,6 +480,22 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
} else if ("notes".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "p", "p", NOTES_ATTRIBUTES);
+ } else if ("a".equals(localName)) {
+ // Suppress any pending style flip before opening the anchor,
+ // and bump the depth so style emission stays quiet while we're
+ // inside it. See updateStyleTags / closeStyleTags.
+ anchorDepth++;
+ super.startElement(namespaceURI, localName, qName, attrs);
+ } else if (isBlockLevelOpen(namespaceURI, localName)) {
+ // Block-level structural elements (draw:text-box -> <div>,
+ // table:table -> <table>, etc.) opened while <b>/<i>/<u> are
+ // on top would trap those inline tags. Subsequent style flips
+ // inside would emit </b> while the block is on top, producing
+ // cross-nested XHTML. Close pending styles before opening the
+ // block; if there's still text to emit at the same style after
+ // the block closes, updateStyleTags() will reopen them.
+ closeStyleTags();
+ super.startElement(namespaceURI, localName, qName, attrs);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
@@ -489,6 +548,24 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
closeStyleTags();
handler.endElement(namespaceURI, "p", "p");
} else if ("a".equals(localName)) {
+ // closeStyleTags is a no-op while anchorDepth > 0, but we keep
+ // the call so any future change to its semantics still runs
+ // here. Decrement only after </a> has been emitted so the
+ // suppression covers the close itself.
+ closeStyleTags();
+ super.endElement(namespaceURI, localName, qName);
+ if (anchorDepth > 0) {
+ anchorDepth--;
+ }
+ } else if (SVG_NS.equals(namespaceURI)
+ && ("title".equals(localName) ||
"desc".equals(localName))) {
+ // svg:title / svg:desc map through MAPPINGS to <span>. If a
+ // <b>/<i>/<u> opened inside (via text:span style) is still
+ // on top when </span> fires, we'd cross-nest. Close pending
+ // style tags before the parent close. Other text:span lazy-
+ // close paths (via characters() -> updateStyleTags) are
+ // unaffected because this branch only triggers on these
+ // specific elements.
closeStyleTags();
super.endElement(namespaceURI, localName, qName);
} else {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index ec68f8c2bd..a361521021 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -223,6 +223,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
attributes.addAttribute("", name, name, "CDATA", value);
}
+ private static void setOrReplaceAttribute(String name, String value,
+ AttributesImpl attributes) {
+ if (name == null || value == null) {
+ return;
+ }
+ int idx = attributes.getIndex("", name);
+ if (idx >= 0) {
+ attributes.setValue(idx, value);
+ } else {
+ attributes.addAttribute("", name, name, "CDATA", value);
+ }
+ }
+
private static PDActionURI getActionURI(PDAnnotation annot) {
//copied and pasted from PDFBox's PrintURLs
@@ -408,8 +421,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
if (spec instanceof PDSimpleFileSpecification) {
//((PDSimpleFileSpecification)spec).getFile();
- attributes.addAttribute("", "class", "class", "CDATA", "linked");
- attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
+ setOrReplaceAttribute("class", "linked", attributes);
+ setOrReplaceAttribute("id", spec.getFile(), attributes);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
} else if (spec instanceof PDComplexFileSpecification) {
@@ -497,8 +510,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", fileName);
+ setOrReplaceAttribute("class", "embedded", attributes);
+ setOrReplaceAttribute("id", fileName, attributes);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
@@ -1093,8 +1106,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m,
context, true);
}
};
- addNonNullAttribute("class", "javascript", attrs);
- addNonNullAttribute("type", jsAction.getType(), attrs);
+ setOrReplaceAttribute("class", "javascript", attrs);
+ setOrReplaceAttribute("type", jsAction.getType(), attrs);
addNonNullAttribute("subtype", jsAction.getSubType(), attrs);
xhtml.startElement("div", attrs);
xhtml.endElement("div");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6ce771c6f9..e73471e353 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1537,6 +1537,20 @@ public class PDFParserTest extends TikaTest {
config.setMaxPages(1);
}
+ // TIKA-4728: handleDestinationOrAction pre-populated class/type on the
action div,
+ // then processJavaScriptAction appended a second class/type for
PDActionJavaScript
+ // actions, producing a div with duplicate attributes that SAX parsers
reject.
+ // TikaTest.getXML wraps with StrictXHTMLValidator, so a regression makes
+ // this test throw at the offending SAX event.
+ @Test
+ public void testExtractActionsXHTMLWellFormed() throws Exception {
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractActions(true);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+ getXML("testPDF_jsActionOnPage.pdf", context);
+ }
+
/**
@Test
public void testWriteLimit() throws Exception {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf
new file mode 100644
index 0000000000..5b283307f8
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_jsActionOnPage.pdf
@@ -0,0 +1,26 @@
+%PDF-1.4
+%����
+1 0 obj
+<< /Type /Catalog /Pages 2 0 R >>
+endobj
+2 0 obj
+<< /Type /Pages /Kids [3 0 R] /Count 1 >>
+endobj
+3 0 obj
+<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /AA << /O 4 0 R >> >>
+endobj
+4 0 obj
+<< /Type /Action /S /JavaScript /JS (app.alert\('TIKA test'\);) >>
+endobj
+xref
+0 5
+0000000000 65535 f
+0000000015 00000 n
+0000000064 00000 n
+0000000121 00000 n
+0000000211 00000 n
+trailer
+<< /Size 5 /Root 1 0 R >>
+startxref
+293
+%%EOF
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
index 4244d62280..f2c38f896f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/TXTParser.java
@@ -97,17 +97,24 @@ public class TXTParser extends
AbstractEncodingDetectorParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata, context);
xhtml.startDocument();
-
- xhtml.startElement("p");
- char[] buffer = new char[4096];
- int n = reader.read(buffer);
- while (n != -1) {
- xhtml.characters(buffer, 0, n);
- n = reader.read(buffer);
+ // try/finally so any upstream exception (e.g., WriteLimitReached)
+ // still emits </p> and </body></html> instead of leaving the
+ // captured XHTML unterminated.
+ try {
+ xhtml.startElement("p");
+ try {
+ char[] buffer = new char[4096];
+ int n = reader.read(buffer);
+ while (n != -1) {
+ xhtml.characters(buffer, 0, n);
+ n = reader.read(buffer);
+ }
+ } finally {
+ xhtml.endElement("p");
+ }
+ } finally {
+ xhtml.endDocument();
}
- xhtml.endElement("p");
-
- xhtml.endDocument();
} finally {
tis.removeCloseShield();
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java
index 2a1224171b..e918c8977b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/tmx/TMXParser.java
@@ -66,11 +66,13 @@ public class TMXParser implements Parser {
metadata.set(Metadata.CONTENT_TYPE, TMX_CONTENT_TYPE.toString());
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata, context);
+ xhtml.startDocument();
tis.setCloseShield();
try {
XMLReaderUtils.parseSAX(tis, new TMXContentHandler(xhtml,
metadata), context);
} finally {
tis.removeCloseShield();
+ xhtml.endDocument();
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
index 476388370d..c800561f43 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -66,12 +66,13 @@ public class XLIFF12Parser implements Parser {
metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata, context);
-
+ xhtml.startDocument();
tis.setCloseShield();
try {
XMLReaderUtils.parseSAX(tis, new XLIFF12ContentHandler(xhtml,
metadata), context);
} finally {
tis.removeCloseShield();
+ xhtml.endDocument();
}
}