This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4734 in repository https://gitbox.apache.org/repos/asf/tika.git
commit dfe180f691bcc4ab22b81fee0bdd52149f9de14b Merge: 0b0dc4f75c cabd1f2d44 Author: tallison <[email protected]> AuthorDate: Wed May 27 10:39:09 2026 -0400 Merge branch 'main' into TIKA-4734 .github/workflows/docker-snapshot.yml | 4 +- .mvn/extensions.xml | 2 +- .skills/tika-eval-h2-query.md | 92 ++ docs/modules/ROOT/nav.adoc | 1 + .../integration-testing/run-uat-script.adoc | 10 +- .../advanced/integration-testing/tika-app.adoc | 8 +- .../integration-testing/tika-eval-regression.adoc | 364 +++++ .../advanced/integration-testing/tika-server.adoc | 18 +- .../pages/maintainers/release-guides/docker.adoc | 4 +- .../release-guides/release-artifacts.adoc | 10 +- .../pages/maintainers/release-guides/tika.adoc | 2 +- docs/modules/ROOT/pages/migration-to-4x/index.adoc | 2 +- .../migration-to-4x/migrating-tika-server-4x.adoc | 10 +- .../pages/migration-to-4x/migrating-to-4x.adoc | 4 +- docs/modules/ROOT/pages/pipes/configuration.adoc | 6 +- docs/modules/ROOT/pages/pipes/cpu-sizing.adoc | 2 +- docs/modules/ROOT/pages/pipes/parse-modes.adoc | 6 +- docs/modules/ROOT/pages/pipes/troubleshooting.adoc | 131 ++ docs/modules/ROOT/pages/using-tika/cli/index.adoc | 74 +- .../ROOT/pages/using-tika/server/index.adoc | 4 +- docs/modules/ROOT/pages/using-tika/server/tls.adoc | 2 +- pom.xml | 52 +- tika-app/pom.xml | 4 + .../main/java/org/apache/tika/cli/AsyncHelper.java | 21 +- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 78 +- .../apache/tika/cli/XmlToJsonConfigConverter.java | 8 +- .../java/org/apache/tika/cli/AsyncHelperTest.java | 30 +- .../tika/cli/XmlToJsonConfigConverterTest.java | 6 +- .../test/resources/configs/config-template.json | 2 - .../src/test/resources/configs/tika-config2.json | 2 +- .../ParsingEmbeddedDocumentExtractor.java | 29 +- .../tika/sax/BasicContentHandlerFactory.java | 30 + .../org/apache/tika/sax/StrictXHTMLValidator.java | 229 +++ .../org/apache/tika/sax/XHTMLBalancingHandler.java | 123 ++ .../src/test/java/org/apache/tika/TikaTest.java | 37 +- .../apache/tika/sax/XHTMLBalancingHandlerTest.java | 130 ++ .../test/resources/tika-config-ignite-local.json | 20 +- .../src/test/resources/tika-config-ignite.json | 20 +- tika-e2e-tests/tika-server/pom.xml | 3 +- .../tika/server/e2e/TikaServerHttp2Test.java | 25 +- .../apache/tika/ml/chardetect/AdaptiveProbe.java | 80 + .../tika/ml/chardetect/CharsetConfusables.java | 27 + .../tika/ml/chardetect/HtmlByteStripper.java | 198 ++- .../ml/chardetect/MojibusterEncodingDetector.java | 198 ++- .../NaiveBayesBigramEncodingDetector.java | 488 +++++- .../org/apache/tika/ml/chardetect/nb-bigram.bin | Bin 975490 -> 1016638 bytes .../tika/ml/chardetect/AdaptiveProbeTest.java | 118 ++ .../apache/tika/ml/chardetect/CalibrateTopK.java | 353 ++++ .../apache/tika/ml/chardetect/CheckUtf8OnFile.java | 83 + .../tika/ml/chardetect/HtmlByteStripperTest.java | 245 +++ .../ml/chardetect/InspectBigramContributions.java | 221 +++ .../apache/tika/ml/chardetect/TraceMojibuster.java | 233 +++ tika-eval/tika-eval-app/pom.xml | 4 + .../src/test/resources/s3/tika-config-s3.json | 2 +- .../chardetect/tools/BuildCharsetTrainingData.java | 23 +- .../chardetect/tools/DiagnoseDiscrimination.java | 399 +++++ .../chardetect/tools/RebalanceCharsetTraining.java | 209 +++ .../ml/chardetect/tools/TrainNaiveBayesBigram.java | 56 +- tika-ml/tika-ml-junkdetect/pom.xml | 12 + .../{V7Tables.java => BigramTables.java} | 36 +- .../tika/ml/junkdetect/HtmlContentCleaner.java | 108 ++ .../apache/tika/ml/junkdetect/JunkDetector.java | 842 ++++++---- .../ml/junkdetect/JunkFilterEncodingDetector.java | 274 ++-- .../tika/ml/junkdetect/TextQualityFeatures.java | 608 +++++++ .../ml/junkdetect/tools/AnalyzeHanByBlock.java | 201 --- .../ml/junkdetect/tools/BuildJunkTrainingData.java | 7 + .../ml/junkdetect/tools/CountPerScriptBigrams.java | 326 ---- .../tika/ml/junkdetect/tools/EvalJunkDetector.java | 777 --------- .../junkdetect/tools/EvalJunkOnCharsetDevtest.java | 688 -------- .../tools/JunkDetectorTrainingConfig.java | 9 +- .../junkdetect/tools/PrototypeCodepointHash.java | 1208 -------------- .../tika/ml/junkdetect/tools/TrainJunkModel.java | 1701 ++++++++++---------- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2810396 -> 2321862 bytes .../apache/tika/ml/junkdetect/EntityRefProbe.java | 164 -- ...rV7Test.java => JunkDetectorRoundTripTest.java} | 165 +- .../tika/ml/junkdetect/JunkDetectorSmokeTest.java | 7 +- .../junkdetect/JunkFilterEncodingDetectorTest.java | 43 + .../ml/junkdetect/LatinSiblingComparisonTest.java | 141 ++ .../ml/junkdetect/TextQualityFeaturesTest.java | 201 +++ .../apache/tika/ml/junkdetect/TraceJunkFilter.java | 536 ++++++ .../tools/BuildJunkAugmentationData.java | 862 ++++++++++ .../tools/BuildJunkAugmentationDataTest.java | 429 +++++ .../tools/JunkDetectorTrainingConfigTest.java | 5 +- tika-parent/pom.xml | 14 +- .../apache/tika/parser/pkg/PackageParserTest.java | 4 + .../resources/configs/tika-config-rendering.json | 2 +- .../tika/parser/iwork/PagesContentHandler.java | 19 +- .../java/org/apache/tika/parser/prt/PRTParser.java | 85 +- .../apache/tika/parser/code/SourceCodeParser.java | 25 + .../tika/parser/code/SourceCodeParserTest.java | 1 - .../microsoft/ooxml/AbstractOOXMLExtractor.java | 15 +- .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 55 + .../ooxml/OOXMLWordAndPowerPointTextHandler.java | 7 + .../microsoft/ooxml/SAXBasedMetadataExtractor.java | 89 +- .../ooxml/SXSLFPowerPointExtractorDecorator.java | 16 +- .../ooxml/SXWPFWordExtractorDecorator.java | 6 + .../ooxml/XSSFExcelExtractorDecorator.java | 36 + .../ooxml/SAXBasedMetadataExtractorTest.java | 216 +++ .../test/resources/configs/tika-libpst-config.json | 2 +- .../resources/configs/tika-libpst-eml-config.json | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 44 +- .../tika/parser/odf/OpenDocumentBodyHandler.java | 77 + .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 14 + .../test-documents/testPDF_jsActionOnPage.pdf | 26 + .../java/org/apache/tika/parser/pkg/ZipParser.java | 29 +- .../java/org/apache/tika/parser/txt/TXTParser.java | 27 +- .../java/org/apache/tika/parser/tmx/TMXParser.java | 2 + .../apache/tika/parser/xliff/XLIFF12Parser.java | 3 +- .../org/apache/tika/async/cli/PluginsWriter.java | 203 ++- .../apache/tika/async/cli/SimpleAsyncConfig.java | 13 + .../org/apache/tika/async/cli/TikaAsyncCLI.java | 16 +- .../apache/tika/async/cli/AsyncCliParserTest.java | 25 + .../apache/tika/async/cli/AsyncProcessorTest.java | 42 + ...plate.json => config-content-only-default.json} | 7 +- .../test/resources/configs/config-template.json | 2 - tika-pipes/tika-pipes-core/pom.xml | 9 + .../tika/pipes/core/AbstractComponentManager.java | 15 + .../tika/pipes/core/PerClientServerManager.java | 45 +- .../org/apache/tika/pipes/core/PipesConfig.java | 26 +- .../apache/tika/pipes/core/ServerProcessIO.java | 112 ++ .../tika/pipes/core/SharedServerManager.java | 41 +- .../tika/pipes/core/config/ConfigMerger.java | 3 - .../tika/pipes/core/config/ConfigOverrides.java | 14 +- .../apache/tika/pipes/core/server/PipesServer.java | 54 + .../apache/tika/pipes/core/server/PipesWorker.java | 3 + .../tika/pipes/core/config/ConfigMergerTest.java | 3 +- .../core/testutil/AbstractConfigExamplesTest.java | 89 + .../apache/tika/pipes/fork/PipesForkParser.java | 1 - .../tika/pipes/fork/PipesForkParserConfig.java | 11 - tika-pipes/tika-pipes-plugins/pom.xml | 7 + .../pipes/atlassianjwt/ConfigExamplesTest.java | 33 +- .../tika/pipes/azblob/ConfigExamplesTest.java | 71 +- .../apache/tika/pipes/csv/ConfigExamplesTest.java | 33 +- .../apache/tika/pipes/es/ConfigExamplesTest.java | 63 +- .../tika/pipes/emitter/fs/FileSystemEmitter.java | 18 +- .../fs/FileSystemEmitterRuntimeConfigTest.java | 31 + .../apache/tika/pipes/fs/ConfigExamplesTest.java | 28 +- .../config-examples/file-system-emitter.json | 7 +- .../config-examples/file-system-fetcher.json | 7 +- .../config-examples/file-system-pipeline.json | 14 +- .../apache/tika/pipes/gcs/ConfigExamplesTest.java | 53 +- .../tika/pipes/googledrive/ConfigExamplesTest.java | 33 +- .../apache/tika/pipes/http/ConfigExamplesTest.java | 34 +- .../apache/tika/pipes/jdbc/ConfigExamplesTest.java | 52 +- .../apache/tika/pipes/json/ConfigExamplesTest.java | 33 +- .../tika/pipes/kafka/ConfigExamplesTest.java | 51 +- .../tika-pipes-microsoft-graph/pom.xml | 2 +- .../pipes/microsoftgraph/ConfigExamplesTest.java | 33 +- .../tika/pipes/opensearch/ConfigExamplesTest.java | 50 +- .../apache/tika/pipes/s3/ConfigExamplesTest.java | 53 +- .../apache/tika/pipes/solr/ConfigExamplesTest.java | 52 +- .../config/loader/AbstractSpiComponentLoader.java | 45 +- .../tika/config/loader/ComponentInstantiator.java | 14 +- .../apache/tika/config/loader/ParserLoader.java | 7 + .../apache/tika/config/loader/TikaJsonConfig.java | 2 +- .../apache/tika/config/loader/TikaLoaderTest.java | 115 +- tika-server/README.md | 10 +- tika-server/docker-build/CHANGES.md | 4 +- tika-server/docker-build/README.md | 8 +- .../docker-build/docker-compose-tika-customocr.yml | 10 +- .../docker-build/docker-compose-tika-grobid.yml | 10 +- tika-server/docker-build/full/Dockerfile | 24 +- tika-server/docker-build/full/Dockerfile.snapshot | 2 +- tika-server/docker-build/minimal/Dockerfile | 24 +- .../docker-build/minimal/Dockerfile.snapshot | 2 +- .../tika/server/core/IntegrationTestBase.java | 31 + .../server/core/benchmark/TikaServerBenchmark.java | 2 +- .../bin/install_tika_service.sh | 21 +- tika-server/tika-server-standard/bin/tika | 10 +- tika-server/tika-server-standard/bin/tika.in.sh | 2 +- tika-server/tika-server-standard/pom.xml | 45 + .../src/main/assembly/assembly.xml | 5 +- tika-translate/pom.xml | 2 +- 174 files changed, 10721 insertions(+), 5899 deletions(-)
