This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4636-simplify-embedded-extractor-handling in repository https://gitbox.apache.org/repos/asf/tika.git
commit d64041cf2dce541e4560163b0a87eb6eb369caa0 Merge: c6d5f49faf 48ca355225 Author: tallison <[email protected]> AuthorDate: Thu Jan 29 06:54:49 2026 -0500 Merge origin/main into simplify-embedded-extractor-handling .../ROOT/pages/advanced/setting-limits.adoc | 356 ++++++++++++++++----- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 27 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 8 +- .../test/resources/configs/config-template.json | 1 - tika-core/src/main/java/org/apache/tika/Tika.java | 26 +- .../org/apache/tika/config/EmbeddedLimits.java | 222 +++++++++++++ .../java/org/apache/tika/config/OutputLimits.java | 269 ++++++++++++++++ .../java/org/apache/tika/config/TimeoutLimits.java | 135 ++++++++ .../exception/EmbeddedLimitReachedException.java | 62 ++++ .../tika/extractor/EmbeddedDocumentExtractor.java | 9 + .../tika/extractor/ParserContainerExtractor.java | 4 +- .../ParsingEmbeddedDocumentExtractor.java | 49 ++- .../java/org/apache/tika/metadata/Metadata.java | 24 ++ .../apache/tika/metadata/TikaCoreProperties.java | 7 + .../writefilter/MetadataWriteLimiterFactory.java | 2 +- .../org/apache/tika/parser/AutoDetectParser.java | 27 +- .../apache/tika/parser/AutoDetectParserConfig.java | 85 +---- .../org/apache/tika/parser/CompositeParser.java | 8 +- .../java/org/apache/tika/parser/ParseContext.java | 6 +- .../java/org/apache/tika/parser/ParseRecord.java | 80 +++-- .../java/org/apache/tika/parser/ParsingReader.java | 4 +- .../apache/tika/parser/RecursiveParserWrapper.java | 16 - .../sax/AbstractRecursiveParserWrapperHandler.java | 24 +- .../tika/sax/BasicContentHandlerFactory.java | 49 +-- .../tika/sax/RecursiveParserWrapperHandler.java | 20 +- .../org/apache/tika/sax/SecureContentHandler.java | 24 ++ .../apache/tika/sax/WriteOutContentHandler.java | 18 ++ .../org/apache/tika/MultiThreadedTikaTest.java | 3 +- .../org/apache/tika/example/ParsingExample.java | 2 +- .../src/test/resources/kafka/plugins-template.json | 1 - .../resources/opensearch/plugins-template.json | 1 - .../opensearch/tika-config-opensearch.json | 1 - .../src/test/resources/s3/plugins-template.json | 1 - .../src/test/resources/solr/plugins-template.json | 1 - .../apache/tika/parser/journal/TEIDOMParser.java | 2 +- .../tika/parser/apple/AppleSingleFileParser.java | 2 +- .../org/apache/tika/parser/apple/PListParser.java | 10 +- .../parser/iwork/iwana/IWork13PackageParser.java | 6 +- .../executable/UniversalExecutableParser.java | 2 +- .../org/apache/tika/parser/crypto/Pkcs7Parser.java | 2 +- .../org/apache/tika/parser/crypto/TSDParser.java | 4 +- .../org/apache/tika/parser/html/HtmlHandler.java | 8 +- .../apache/tika/parser/jdbc/JDBCTableReader.java | 4 +- .../tika/parser/mail/MailContentHandler.java | 4 +- .../org/apache/tika/parser/mbox/MboxParser.java | 2 +- .../parser/microsoft/AbstractPOIFSExtractor.java | 12 +- .../apache/tika/parser/microsoft/EMFParser.java | 4 +- .../tika/parser/microsoft/HSLFExtractor.java | 4 +- .../tika/parser/microsoft/JackcessExtractor.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 6 +- .../tika/parser/microsoft/OutlookExtractor.java | 12 +- .../apache/tika/parser/microsoft/TNEFParser.java | 2 +- .../tika/parser/microsoft/chm/ChmParser.java | 2 +- .../tika/parser/microsoft/libpst/EmailVisitor.java | 2 +- .../microsoft/onenote/OneNoteTreeWalker.java | 2 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 10 +- .../ooxml/XWPFWordExtractorDecorator.java | 2 +- .../microsoft/ooxml/xps/XPSPageContentHandler.java | 2 +- .../ooxml/xwpf/ml2006/BinaryDataHandler.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 2 +- .../parser/microsoft/pst/PSTMailItemParser.java | 6 +- .../parser/microsoft/rtf/RTFEmbObjHandler.java | 6 +- .../tika/parser/microsoft/xml/WordMLParser.java | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 4 +- .../apache/tika/parser/indesign/IDMLParser.java | 6 +- .../parser/odf/FlatOpenDocumentMacroHandler.java | 7 +- .../tika/parser/odf/OpenDocumentBodyHandler.java | 7 +- .../tika/parser/odf/OpenDocumentMacroHandler.java | 3 +- .../apache/tika/parser/odf/OpenDocumentParser.java | 4 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 12 +- .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../tika/parser/pdf/image/ImageGraphicsEngine.java | 2 +- .../tika/renderer/pdf/mutool/MuPDFRenderer.java | 2 +- .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 2 +- .../apache/tika/parser/pkg/CompressorParser.java | 2 +- .../org/apache/tika/parser/pkg/PackageParser.java | 2 +- .../org/apache/tika/parser/pkg/UnrarParser.java | 2 +- .../org/apache/tika/parser/http/HttpParser.java | 2 +- .../org/apache/tika/parser/wacz/WACZParser.java | 2 +- .../org/apache/tika/parser/warc/WARCParser.java | 2 +- .../apache/tika/parser/xml/FictionBookParser.java | 2 +- .../tika/parser/AutoDetectParserConfigTest.java | 2 +- .../tika/parser/RecursiveParserWrapperTest.java | 17 +- .../apache/tika/parser/image/JpegParserTest.java | 2 +- .../tika/parser/microsoft/rtf/RTFParserTest.java | 3 +- .../src/test/resources/configs/tika-4533.json | 3 - .../configs/tika-config-bc-digests-base32.json | 1 - .../configs/tika-config-bc-digests-basic.json | 1 - .../configs/tika-config-bc-digests-multiple.json | 1 - .../configs/tika-config-commons-digests-basic.json | 1 - .../configs/tika-config-digests-pdf-only.json | 1 - .../tika-config-digests-skip-container.json | 1 - .../resources/configs/tika-config-digests.json | 1 - ...a-config-doubling-custom-handler-decorator.json | 1 - .../resources/configs/tika-config-no-names.json | 6 +- ...a-config-upcasing-custom-handler-decorator.json | 6 +- .../resources/configs/tika-config-with-names.json | 5 +- .../configs/tika-config-write-filter.json | 1 - .../src/main/resources/config-template.json | 1 - .../tika/pipes/core/server/ParseHandler.java | 20 +- .../apache/tika/pipes/fork/PipesForkParser.java | 6 + .../tika/pipes/fork/PipesForkParserConfig.java | 32 +- .../test/resources/configs/tika-config-basic.json | 2 - .../resources/configs/tika-config-passback.json | 2 - .../resources/configs/tika-config-truncate.json | 4 +- .../resources/configs/tika-config-uppercasing.json | 1 - .../configs/tika-config-write-limiter.json | 2 - .../org/apache/tika/config/loader/TikaLoader.java | 35 +- .../java/org/apache/tika/config/AllLimitsTest.java | 156 +++++++++ .../org/apache/tika/config/EmbeddedLimitsTest.java | 109 +++++++ .../org/apache/tika/config/OutputLimitsTest.java | 119 +++++++ .../org/apache/tika/config/TimeoutLimitsTest.java | 95 ++++++ .../writefilter/StandardMetadataLimiterTest.java | 6 +- .../test/resources/configs/TIKA-3695-exclude.json | 3 - .../test/resources/configs/TIKA-3695-fields.json | 3 - .../src/test/resources/configs/TIKA-3695.json | 3 - .../test/resources/configs/all-limits-test.json | 32 ++ .../resources/configs/embedded-limits-test.json | 10 + .../test/resources/configs/output-limits-test.json | 12 + .../resources/configs/timeout-limits-test.json | 7 + .../server/core/resource/DetectorResource.java | 2 +- .../server/core/resource/MetadataResource.java | 8 +- .../server/core/resource/PipesParsingHelper.java | 2 +- .../core/resource/RecursiveMetadataResource.java | 23 +- .../server/core/resource/ServerHandlerConfig.java | 5 +- .../tika/server/core/resource/TikaResource.java | 24 +- .../server/core/resource/UnpackerResource.java | 4 +- .../org/apache/tika/server/core/CXFTestBase.java | 1 - .../resources/configs/cxf-test-base-template.json | 1 - .../standard/resource/XMPMetadataResource.java | 4 +- .../resources/configs/cxf-test-base-template.json | 1 - .../configs/tika-config-for-server-tests.json | 1 - .../tika-config-langdetect-opennlp-filter.json | 1 - .../tika-config-langdetect-optimaize-filter.json | 1 - 135 files changed, 1972 insertions(+), 584 deletions(-) diff --cc tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index 90eaf73a6c,2ce72443b8..21f08a9191 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@@ -25,13 -26,13 +25,13 @@@ import org.apache.tika.metadata.Metadat import org.apache.tika.sax.ContentHandlerDecoratorFactory; /** - * This config object can be used to tune how conservative we want to be - * when parsing data that is extremely compressible and resembles a ZIP - * bomb. Null values will be ignored and will not affect the default values - * in SecureContentHandler. + * Configuration for AutoDetectParser behavior. + * <p> + * Note: Security limits (zip bomb thresholds, XML depth, etc.) are now configured + * via {@link org.apache.tika.config.OutputLimits} in the ParseContext, not here. * <p> * This is a config POJO. It uses standard Jackson deserialization for its - * primitive fields, but component fields (like embeddedDocumentExtractorFactory) + * primitive fields, but component fields (like contentHandlerDecoratorFactory) * use compact format. */ @TikaComponent(spi = false) @@@ -48,84 -49,25 +48,14 @@@ public class AutoDetectParserConfig imp public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig(); - /** - * SecureContentHandler -- Desired output threshold in characters. - */ - private Long outputThreshold = null; - - /** - * SecureContentHandler -- Desired maximum compression ratio. - */ - private Long maximumCompressionRatio = null; - - /** - * SecureContentHandler -- Desired maximum XML nesting level. - */ - private Integer maximumDepth = null; - - /** - * SecureContentHandler -- Desired maximum package entry nesting level. - */ - private Integer maximumPackageEntryDepth = null; - private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory = null; -- private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory = NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; private boolean throwOnZeroBytes = true; - /** - * Creates a SecureContentHandlerConfig using the passed in parameters. - * - * @param outputThreshold SecureContentHandler - character output threshold. - * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. - * @param maximumDepth SecureContentHandler - maximum XML element nesting level. - * @param maximumPackageEntryDepth SecureContentHandler - maximum package entry nesting level. - */ - public AutoDetectParserConfig(Long outputThreshold, - Long maximumCompressionRatio, Integer maximumDepth, - Integer maximumPackageEntryDepth) { - this.outputThreshold = outputThreshold; - this.maximumCompressionRatio = maximumCompressionRatio; - this.maximumDepth = maximumDepth; - this.maximumPackageEntryDepth = maximumPackageEntryDepth; - } - public AutoDetectParserConfig() { - - } - - public Long getOutputThreshold() { - return outputThreshold; - } - - public void setOutputThreshold(Long outputThreshold) { - this.outputThreshold = outputThreshold; - } - - public Long getMaximumCompressionRatio() { - return maximumCompressionRatio; - } - - public void setMaximumCompressionRatio(Long maximumCompressionRatio) { - this.maximumCompressionRatio = maximumCompressionRatio; - } - - public Integer getMaximumDepth() { - return maximumDepth; - } - - public void setMaximumDepth(Integer maximumDepth) { - this.maximumDepth = maximumDepth; - } - - public Integer getMaximumPackageEntryDepth() { - return maximumPackageEntryDepth; - } - - public void setMaximumPackageEntryDepth(Integer maximumPackageEntryDepth) { - this.maximumPackageEntryDepth = maximumPackageEntryDepth; } - public void setEmbeddedDocumentExtractorFactory( - EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory) { - this.embeddedDocumentExtractorFactory = embeddedDocumentExtractorFactory; - } - - public EmbeddedDocumentExtractorFactory getEmbeddedDocumentExtractorFactory() { - return embeddedDocumentExtractorFactory; - } - public void setContentHandlerDecoratorFactory( ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) { this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory; @@@ -145,11 -87,9 +75,8 @@@ @Override public String toString() { - return "AutoDetectParserConfig{" + "outputThreshold=" + - outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + - ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + - maximumPackageEntryDepth + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + + return "AutoDetectParserConfig{" + - "embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory + - ", contentHandlerDecoratorFactory=" + contentHandlerDecoratorFactory + ++ "contentHandlerDecoratorFactory=" + contentHandlerDecoratorFactory + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json index 39a72c7516,9687287ca5..33fcd5ffd7 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json @@@ -1,13 -1,10 +1,9 @@@ { -- "auto-detect-parser": { - "outputThreshold": 678900 - }, - "embeddedDocumentExtractorFactory": { - "runpack-extractor-factory": { + "other-configs": { + "embedded-document-extractor-factory": { - "org.apache.tika.extractor.StandardExtractorFactory": { ++ "standard-extractor-factory": { "writeFileNameToContent": false } } } } -- diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index f22dc93fd1,39ddeff844..8e9b5b6012 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@@ -1,20 -1,28 +1,16 @@@ { "auto-detect-parser": { - "outputThreshold": 1000, - "maximumCompressionRatio": 0.8, - "maximumDepth": 1000, - "maximumPackageEntryDepth": 1000, - "embeddedDocumentExtractorFactory": { - "runpack-extractor-factory": { - "writeFileNameToContent": true, - "embeddedBytesIncludeMimeTypes": [ - "text/pdf" - ], - "embeddedBytesExcludeMimeTypes": [ - "rtf/application" - ], - "embeddedBytesIncludeEmbeddedResourceTypes": [ - "appended" - ], - "embeddedBytesExcludeEmbeddedResourceTypes": [ - ], - "maxEmbeddedBytesForExtraction": 10737418240 - } - }, "contentHandlerDecoratorFactory": "upcasing-content-handler-decorator-factory", "throwOnZeroBytes": true }, "other-configs": { "digester-factory": { "commons-digester-factory": {} + }, + "embedded-document-extractor-factory": { - "org.apache.tika.extractor.StandardExtractorFactory": { ++ "standard-extractor-factory": { + "writeFileNameToContent": true + } } } } diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json index 3c02acbc8e,abea0d901e..28f542245b --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json @@@ -1,10 -1,7 +1,7 @@@ { -- "auto-detect-parser": { - "outputThreshold": 678900 - }, - "embeddedDocumentExtractorFactory": { - "runpack-extractor-factory": { + "other-configs": { + "embedded-document-extractor-factory": { - "org.apache.tika.extractor.StandardExtractorFactory": { ++ "standard-extractor-factory": { "writeFileNameToContent": true } } diff --cc tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 81c95a3adf,e56132f268..6e86502d2b --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@@ -47,9 -47,8 +47,8 @@@ import org.apache.tika.parser.ParseReco import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.ParseMode; -import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; +import org.apache.tika.pipes.core.extractor.UnpackConfig; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; - import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ExceptionUtils; diff --cc tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index d7687c77e8,50d9875b25..d8acd13939 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@@ -45,16 -44,17 +44,17 @@@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, - "embeddedDocumentExtractorFactory": { - "runpack-extractor-factory": { - "writeFileNameToContent": false, - "maxEmbeddedBytesForExtraction": 10 - } - }, "throwOnZeroBytes": false }, "other-configs": { "digester-factory": { "mock-digester-factory": {} + }, + "embedded-document-extractor-factory": { + "runpack-extractor-factory": { ++ "writeFileNameToContent": false, ++ "maxEmbeddedBytesForExtraction": 10 + } } }, "plugin-roots": "PLUGINS_PATHS" diff --cc tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 92806c2913,2c5960ed25..bc3f395157 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@@ -404,27 -407,19 +408,20 @@@ public class TikaLoader */ public ParseContext loadParseContext() throws TikaConfigException { ParseContext context = new ParseContext(); + loadOne(DigesterFactory.class, context); + loadOne(MetadataWriteLimiterFactory.class, context); ++ loadOne(EmbeddedDocumentExtractorFactory.class, context); + loadOne(EmbeddedLimits.class, context); + loadOne(OutputLimits.class, context); + loadOne(TimeoutLimits.class, context); + return context; + } - // Load DigesterFactory from other-configs if present - DigesterFactory digesterFactory = configs().load("digester-factory", DigesterFactory.class); - if (digesterFactory != null) { - context.set(DigesterFactory.class, digesterFactory); - } - - // Load MetadataWriteLimiterFactory from other-configs if present - MetadataWriteLimiterFactory metadataWriteLimiterFactory = configs().load(MetadataWriteLimiterFactory.class); - if (metadataWriteLimiterFactory != null) { - context.set(MetadataWriteLimiterFactory.class, metadataWriteLimiterFactory); - } - - // Load EmbeddedDocumentExtractorFactory from other-configs if present - EmbeddedDocumentExtractorFactory extractorFactory = - configs().load(EmbeddedDocumentExtractorFactory.class); - if (extractorFactory != null) { - context.set(EmbeddedDocumentExtractorFactory.class, extractorFactory); + private <T> void loadOne(Class<T> clazz, ParseContext context) throws TikaConfigException { + T instnce = configs().load(clazz); + if (instnce != null) { + context.set(clazz, instnce); } - - return context; } /**
