This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4639 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 064dcbfda3638c43bfbd821f0d96c36b1c9cd9da Merge: 6aed8956a4 589d1c25b1 Author: tallison <[email protected]> AuthorDate: Thu Jan 29 08:05:08 2026 -0500 Merge branch 'main' of https://github.com/apache/tika into TIKA-4639 .../test/java/org/apache/tika/cli/TikaCLITest.java | 41 ++++--- .../ParsingEmbeddedDocumentExtractor.java | 20 ++-- .../tika/extractor/RUnpackExtractorFactory.java | 121 -------------------- ...rFactory.java => StandardExtractorFactory.java} | 18 +-- .../java/org/apache/tika/io/FilenameUtils.java | 2 +- .../org/apache/tika/parser/AutoDetectParser.java | 7 +- .../apache/tika/parser/AutoDetectParserConfig.java | 17 +-- .../java/org/apache/tika/parser/EmptyParser.java | 2 +- .../tika/parser/external/ExternalParser.java | 2 +- .../tika/parser/external2/ExternalParser.java | 2 +- .../java/org/apache/tika/sax/SAXOutputConfig.java | 76 +++++++++++++ .../org/apache/tika/sax/XHTMLContentHandler.java | 87 +++++++++++---- .../apache/tika/sax/XHTMLContentHandlerTest.java | 123 +++++++++++++++++++++ .../org/apache/tika/example/RollbackSoftware.java | 2 +- .../org/apache/custom/parser/MyCustomParser.java | 2 +- .../apache/tika/parser/envi/EnviHeaderParser.java | 2 +- .../org/apache/tika/parser/gdal/GDALParser.java | 8 +- .../geoinfo/GeographicInformationParser.java | 2 +- .../org/apache/tika/parser/grib/GribParser.java | 2 +- .../java/org/apache/tika/parser/hdf/HDFParser.java | 2 +- .../apache/tika/parser/isatab/ISArchiveParser.java | 2 +- .../apache/tika/parser/netcdf/NetCDFParser.java | 2 +- .../apache/tika/parser/ner/NamedEntityParser.java | 2 +- .../parser/transcribe/aws/AmazonTranscribe.java | 2 +- .../tika/parser/apple/AppleSingleFileParser.java | 2 +- .../org/apache/tika/parser/apple/PListParser.java | 2 +- .../tika/parser/iwork/IWorkPackageParser.java | 2 +- .../parser/iwork/iwana/IWork13PackageParser.java | 2 +- .../org/apache/tika/parser/audio/AudioParser.java | 2 +- .../org/apache/tika/parser/audio/MidiParser.java | 2 +- .../java/org/apache/tika/parser/mp3/Mp3Parser.java | 2 +- .../java/org/apache/tika/parser/mp4/MP4Parser.java | 2 +- .../org/apache/tika/parser/ogg/FlacParser.java | 2 +- .../java/org/apache/tika/parser/ogg/OggParser.java | 2 +- .../org/apache/tika/parser/ogg/OpusParser.java | 2 +- .../org/apache/tika/parser/ogg/SpeexParser.java | 2 +- .../org/apache/tika/parser/ogg/TheoraParser.java | 2 +- .../org/apache/tika/parser/ogg/VorbisParser.java | 2 +- .../org/apache/tika/parser/video/FLVParser.java | 2 +- .../org/apache/tika/parser/dgn/DGN8Parser.java | 2 +- .../java/org/apache/tika/parser/dwg/DWGParser.java | 2 +- .../org/apache/tika/parser/dwg/DWGReadParser.java | 2 +- .../java/org/apache/tika/parser/prt/PRTParser.java | 2 +- .../org/apache/tika/parser/asm/ClassParser.java | 2 +- .../apache/tika/parser/asm/XHTMLClassVisitor.java | 5 +- .../apache/tika/parser/code/SourceCodeParser.java | 2 +- .../tika/parser/executable/ExecutableParser.java | 2 +- .../executable/UniversalExecutableParser.java | 2 +- .../java/org/apache/tika/parser/mat/MatParser.java | 2 +- .../org/apache/tika/parser/sas/SAS7BDATParser.java | 2 +- .../org/apache/tika/parser/crypto/TSDParser.java | 2 +- .../tika/parser/font/AdobeFontMetricParser.java | 2 +- .../apache/tika/parser/font/TrueTypeParser.java | 2 +- .../org/apache/tika/parser/html/HtmlHandler.java | 2 +- .../tika/parser/image/AbstractImageParser.java | 4 +- .../org/apache/tika/parser/image/ICNSParser.java | 2 +- .../org/apache/tika/parser/image/PSDParser.java | 2 +- .../org/apache/tika/parser/image/WebPParser.java | 2 +- .../apache/tika/parser/jdbc/AbstractDBParser.java | 2 +- .../org/apache/tika/parser/mail/RFC822Parser.java | 2 +- .../org/apache/tika/parser/mbox/MboxParser.java | 2 +- .../apache/tika/parser/microsoft/EMFParser.java | 2 +- .../tika/parser/microsoft/JackcessParser.java | 2 +- .../tika/parser/microsoft/MSOwnerFileParser.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 2 +- .../tika/parser/microsoft/OldExcelParser.java | 2 +- .../apache/tika/parser/microsoft/TNEFParser.java | 2 +- .../apache/tika/parser/microsoft/WMFParser.java | 2 +- .../microsoft/activemime/ActiveMimeParser.java | 2 +- .../tika/parser/microsoft/chm/ChmParser.java | 2 +- .../tika/parser/microsoft/libpst/LibPstParser.java | 2 +- .../parser/microsoft/onenote/OneNoteParser.java | 2 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +- .../ooxml/xwpf/ml2006/Word2006MLParser.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 2 +- .../parser/microsoft/pst/PSTMailItemParser.java | 2 +- .../tika/parser/microsoft/rtf/RTFParser.java | 2 +- .../microsoft/xml/AbstractXML2003Parser.java | 2 +- .../java/org/apache/tika/parser/dbf/DBFParser.java | 2 +- .../java/org/apache/tika/parser/dif/DIFParser.java | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 2 +- .../org/apache/tika/parser/hwp/HwpV5Parser.java | 2 +- .../apache/tika/parser/indesign/IDMLParser.java | 2 +- .../java/org/apache/tika/parser/mif/MIFParser.java | 2 +- .../tika/parser/odf/FlatOpenDocumentParser.java | 2 +- .../tika/parser/odf/OpenDocumentContentParser.java | 2 +- .../apache/tika/parser/odf/OpenDocumentParser.java | 2 +- .../tika/parser/wordperfect/QuattroProParser.java | 2 +- .../tika/parser/wordperfect/WordPerfectParser.java | 2 +- .../org/apache/tika/parser/feed/FeedParser.java | 2 +- .../apache/tika/parser/iptc/IptcAnpaParser.java | 2 +- .../apache/tika/parser/ocr/TesseractOCRParser.java | 2 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../apache/tika/parser/pkg/CompressorParser.java | 2 +- .../org/apache/tika/parser/pkg/PackageParser.java | 2 +- .../java/org/apache/tika/parser/pkg/RarParser.java | 2 +- .../org/apache/tika/parser/pkg/UnrarParser.java | 2 +- .../apache/tika/parser/csv/TextAndCSVParser.java | 8 +- .../tika/parser/strings/Latin1StringsParser.java | 2 +- .../apache/tika/parser/strings/StringsParser.java | 2 +- .../java/org/apache/tika/parser/txt/TXTParser.java | 2 +- .../org/apache/tika/parser/http/HttpParser.java | 2 +- .../org/apache/tika/parser/wacz/WACZParser.java | 2 +- .../org/apache/tika/parser/warc/WARCParser.java | 2 +- .../java/org/apache/tika/parser/tmx/TMXParser.java | 2 +- .../apache/tika/parser/xliff/XLIFF12Parser.java | 2 +- .../org/apache/tika/parser/xliff/XLZParser.java | 2 +- .../java/org/apache/tika/parser/xml/XMLParser.java | 2 +- .../apache/tika/parser/AutoDetectParserTest.java | 23 ---- .../tika/parser/AutoDetectReaderParserTest.java | 2 +- .../resources/configs/tika-config-no-names.json | 7 +- ...a-config-upcasing-custom-handler-decorator.json | 22 +--- .../resources/configs/tika-config-with-names.json | 6 +- .../org/apache/tika/async/cli/TikaAsyncCLI.java | 10 +- .../apache/tika/async/cli/AsyncProcessorTest.java | 12 +- .../AbstractEmbeddedDocumentBytesHandler.java | 49 +++----- .../BasicEmbeddedDocumentBytesHandler.java | 57 ---------- .../EmittingEmbeddedDocumentBytesHandler.java | 14 +-- .../pipes/core}/extractor/RUnpackExtractor.java | 13 +-- .../core/extractor/RUnpackExtractorFactory.java | 24 ++-- ...dDocumentBytesConfig.java => UnpackConfig.java} | 116 +++++++++++++++---- .../apache/tika/pipes/core/server/EmitHandler.java | 12 +- .../tika/pipes/core/server/ParseHandler.java | 8 +- .../apache/tika/pipes/core/server/PipesServer.java | 14 +-- .../apache/tika/pipes/core/server/PipesWorker.java | 34 +++--- .../core/extractor/UnpackConfigSelectorTest.java | 33 +++--- .../core/serialization/JsonFetchEmitTupleTest.java | 2 +- tika-pipes/tika-pipes-integration-tests/pom.xml | 12 ++ .../pipes/core/DigestingOpenContainersTest.java | 66 +++++++++++ .../apache/tika/pipes/core/PipesServerTest.java | 97 +--------------- .../src/test/resources/configs/tika-4533.json | 19 ++++ .../resources/configs/tika-config-truncate.json | 12 +- .../resources/test-documents/testLargeOLEDoc.doc | Bin 0 -> 2077696 bytes .../tika/config/loader/ComponentRegistry.java | 6 +- .../org/apache/tika/config/loader/TikaLoader.java | 4 + .../configs/TIKA-4207-embedded-bytes-config.json | 14 --- .../tika/server/core/resource/AsyncResource.java | 10 +- .../apache/tika/server/standard/TikaPipesTest.java | 10 +- 139 files changed, 751 insertions(+), 684 deletions(-) diff --cc tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 70fe1ad08f,ae9a33e170..752c0c2e35 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@@ -28,9 -28,10 +28,9 @@@ import org.apache.tika.exception.TikaEx import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; - import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; + import org.apache.tika.extractor.StandardExtractorFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.SecureContentHandler;
