This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4723 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d32e187fec63406fe451fcf72070ce739e938391 Merge: 08365d977e 0b6583091b Author: tallison <[email protected]> AuthorDate: Tue May 12 11:19:03 2026 -0400 Merge remote-tracking branch 'origin/main' into TIKA-4723 .github/workflows/docker-release.yml | 169 ++++++++++-- .github/workflows/docker-snapshot.yml | 4 +- docs/.gitignore | 3 + docs/build-docs.sh | 53 ---- .../ROOT/examples/migration-full-example.json | 2 +- docs/modules/ROOT/examples/pdf-parser-basic.json | 2 +- docs/modules/ROOT/examples/pdf-parser-full.json | 2 +- .../ROOT/examples/pipes-atlassian-jwt-fetcher.json | 1 + .../ROOT/examples/pipes-azblob-emitter.json | 1 + .../ROOT/examples/pipes-azblob-fetcher.json | 1 + .../ROOT/examples/pipes-azblob-iterator.json | 1 + .../ROOT/examples/pipes-azblob-pipeline.json | 1 + .../ROOT/examples/pipes-config-template.json | 1 + docs/modules/ROOT/examples/pipes-csv-iterator.json | 1 + .../ROOT/examples/pipes-elasticsearch-emitter.json | 1 + .../examples/pipes-elasticsearch-pipeline.json | 1 + .../examples/pipes-elasticsearch-reporter.json | 1 + docs/modules/ROOT/examples/pipes-emit-all.json | 1 + docs/modules/ROOT/examples/pipes-fs-emitter.json | 1 - docs/modules/ROOT/examples/pipes-fs-fetcher.json | 1 - docs/modules/ROOT/examples/pipes-fs-pipeline.json | 2 +- docs/modules/ROOT/examples/pipes-gcs-emitter.json | 1 + docs/modules/ROOT/examples/pipes-gcs-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-gcs-iterator.json | 1 + docs/modules/ROOT/examples/pipes-gcs-pipeline.json | 1 + .../ROOT/examples/pipes-google-drive-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-http-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-jdbc-emitter.json | 1 + .../modules/ROOT/examples/pipes-jdbc-iterator.json | 1 + .../modules/ROOT/examples/pipes-jdbc-pipeline.json | 1 + .../modules/ROOT/examples/pipes-jdbc-reporter.json | 1 + .../modules/ROOT/examples/pipes-json-iterator.json | 1 + .../modules/ROOT/examples/pipes-kafka-emitter.json | 1 + .../ROOT/examples/pipes-kafka-iterator.json | 1 + .../ROOT/examples/pipes-kafka-pipeline.json | 1 + .../examples/pipes-microsoft-graph-fetcher.json | 1 + .../ROOT/examples/pipes-opensearch-emitter.json | 1 + .../ROOT/examples/pipes-opensearch-pipeline.json | 1 + .../ROOT/examples/pipes-opensearch-reporter.json | 1 + docs/modules/ROOT/examples/pipes-s3-emitter.json | 1 + docs/modules/ROOT/examples/pipes-s3-fetcher.json | 1 + docs/modules/ROOT/examples/pipes-s3-iterator.json | 1 + docs/modules/ROOT/examples/pipes-s3-pipeline.json | 1 + .../modules/ROOT/examples/pipes-shared-server.json | 1 + .../ROOT/examples/pipes-solr-emitter-zk.json | 1 + docs/modules/ROOT/examples/pipes-solr-emitter.json | 1 + .../modules/ROOT/examples/pipes-solr-iterator.json | 1 + .../modules/ROOT/examples/pipes-solr-pipeline.json | 1 + docs/modules/ROOT/examples/tesseract-basic.json | 2 +- docs/modules/ROOT/examples/tesseract-full.json | 2 +- docs/modules/ROOT/nav.adoc | 18 +- .../pages/advanced/charset-detection-design.adoc | 2 +- .../integration-testing/run-uat-script.adoc | 124 +++++++++ .../ROOT/pages/advanced/junk-detection-build.adoc | 16 +- .../ROOT/pages/advanced/language-detection.adoc | 19 -- .../pages/configuration/encoding-detectors.adoc | 183 +++++++------ .../configuration/parsers/external-parser.adoc | 8 +- .../pages/maintainers/release-guides/docker.adoc | 299 +++++++++++++++++---- .../pages/maintainers/release-guides/tika.adoc | 168 +++++++++++- docs/modules/ROOT/pages/maintainers/site.adoc | 36 +-- .../pages/migration-to-4x/design-notes-4x.adoc | 2 +- docs/modules/ROOT/pages/migration-to-4x/index.adoc | 2 + docs/modules/ROOT/pages/pipes/configuration.adoc | 53 +++- docs/modules/ROOT/pages/pipes/cpu-sizing.adoc | 33 +++ docs/modules/ROOT/pages/pipes/emitters.adoc | 245 +++++------------ docs/modules/ROOT/pages/pipes/fetchers.adoc | 264 ++++-------------- docs/modules/ROOT/pages/pipes/getting-started.adoc | 4 +- docs/modules/ROOT/pages/pipes/index.adoc | 2 +- docs/modules/ROOT/pages/pipes/iterators.adoc | 230 ++++------------ docs/modules/ROOT/pages/pipes/parse-modes.adoc | 143 +++++++--- .../ROOT/pages/pipes/plugins/atlassian-jwt.adoc | 121 +++++++++ docs/modules/ROOT/pages/pipes/plugins/azblob.adoc | 185 +++++++++++++ docs/modules/ROOT/pages/pipes/plugins/csv.adoc | 75 ++++++ .../ROOT/pages/pipes/plugins/elasticsearch.adoc | 196 ++++++++++++++ .../ROOT/pages/pipes/plugins/filesystem.adoc | 255 ++++++++++++++++++ docs/modules/ROOT/pages/pipes/plugins/gcs.adoc | 166 ++++++++++++ .../ROOT/pages/pipes/plugins/google-drive.adoc | 79 ++++++ docs/modules/ROOT/pages/pipes/plugins/http.adoc | 132 +++++++++ docs/modules/ROOT/pages/pipes/plugins/index.adoc | 133 +++++++++ docs/modules/ROOT/pages/pipes/plugins/jdbc.adoc | 241 +++++++++++++++++ docs/modules/ROOT/pages/pipes/plugins/json.adoc | 63 +++++ docs/modules/ROOT/pages/pipes/plugins/kafka.adoc | 213 +++++++++++++++ .../ROOT/pages/pipes/plugins/microsoft-graph.adoc | 85 ++++++ .../ROOT/pages/pipes/plugins/opensearch.adoc | 176 ++++++++++++ docs/modules/ROOT/pages/pipes/plugins/s3.adoc | 242 +++++++++++++++++ docs/modules/ROOT/pages/pipes/plugins/solr.adoc | 202 ++++++++++++++ docs/modules/ROOT/pages/pipes/reporters.adoc | 99 +++---- .../ROOT/pages/pipes/shared-server-mode.adoc | 2 + docs/modules/ROOT/pages/using-tika/grpc/index.adoc | 22 ++ docs/pom.xml | 82 +++++- docs/publish-docs.sh | 51 ++++ .../org/apache/tika/mime/tika-mimetypes.xml | 6 +- tika-parent/pom.xml | 16 +- .../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 5 +- .../resources/config-examples/pdf-parser-full.json | 1 + .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 + .../apache/tika/parser/pdf/PDFParserConfig.java | 24 ++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 28 ++ .../pipes/atlassianjwt/ConfigExamplesTest.java | 69 +++++ .../config-examples/atlassian-jwt-fetcher.json | 19 ++ .../tika/pipes/azblob/ConfigExamplesTest.java | 134 +++++++++ .../resources/config-examples/az-blob-emitter.json | 14 + .../resources/config-examples/az-blob-fetcher.json | 13 + .../config-examples/az-blob-pipeline.json | 45 ++++ .../config-examples/az-blob-pipes-iterator.json | 13 + .../config/tika-config-az-blob-fetcher.xml | 30 --- .../test/resources/config/tika-config-az-blob.xml | 28 -- .../apache/tika/pipes/csv/ConfigExamplesTest.java | 70 +++++ .../config-examples/csv-pipes-iterator.json | 12 + .../apache/tika/pipes/es/ConfigExamplesTest.java | 126 +++++++++ .../test/resources/config-examples/es-emitter.json | 19 ++ .../resources/config-examples/es-pipeline.json | 60 +++++ .../resources/config-examples/es-reporter.json | 15 ++ .../apache/tika/pipes/gcs/ConfigExamplesTest.java | 133 +++++++++ .../resources/config-examples/gcs-emitter.json | 12 + .../resources/config-examples/gcs-fetcher.json | 12 + .../resources/config-examples/gcs-pipeline.json | 42 +++ .../config-examples/gcs-pipes-iterator.json | 11 + .../src/test/resources/config/tika-config-gcs.xml | 26 -- .../tika-pipes-google-drive/pom.xml | 2 +- .../tika/pipes/googledrive/ConfigExamplesTest.java | 70 +++++ .../config-examples/google-drive-fetcher.json | 13 + .../apache/tika/pipes/http/ConfigExamplesTest.java | 70 +++++ .../resources/config-examples/http-fetcher.json | 21 ++ .../reporter/jdbc/JDBCPipesReporterConfig.java | 27 +- .../apache/tika/pipes/jdbc/ConfigExamplesTest.java | 150 +++++++++++ .../resources/config-examples/jdbc-emitter.json | 22 ++ .../resources/config-examples/jdbc-pipeline.json | 56 ++++ .../config-examples/jdbc-pipes-iterator.json | 15 ++ .../resources/config-examples/jdbc-reporter.json | 12 + .../tika-config-jdbc-emitter-attachments.xml | 53 ---- .../tika-config-jdbc-emitter-existing-table.xml | 42 --- .../tika-config-jdbc-emitter-multivalued.xml | 45 ---- .../configs/tika-config-jdbc-emitter-trunc.xml | 44 --- .../resources/configs/tika-config-jdbc-emitter.xml | 54 ---- .../apache/tika/pipes/json/ConfigExamplesTest.java | 67 +++++ .../config-examples/json-pipes-iterator.json | 9 + .../tika/pipes/kafka/ConfigExamplesTest.java | 119 ++++++++ .../resources/config-examples/kafka-emitter.json | 19 ++ .../resources/config-examples/kafka-pipeline.json | 43 +++ .../config-examples/kafka-pipes-iterator.json | 14 + .../tika-pipes-microsoft-graph/pom.xml | 2 +- .../pipes/microsoftgraph/ConfigExamplesTest.java | 72 +++++ .../config-examples/microsoft-graph-fetcher.json | 15 ++ .../tika/pipes/opensearch/ConfigExamplesTest.java | 123 +++++++++ .../config-examples/opensearch-emitter.json | 21 ++ .../config-examples/opensearch-pipeline.json | 64 +++++ .../config-examples/opensearch-reporter.json | 17 ++ .../test/resources/tika-config-simple-emitter.xml | 41 --- .../apache/tika/pipes/s3/ConfigExamplesTest.java | 136 ++++++++++ .../test/resources/config-examples/s3-emitter.json | 14 + .../test/resources/config-examples/s3-fetcher.json | 15 ++ .../resources/config-examples/s3-pipeline.json | 49 ++++ .../config-examples/s3-pipes-iterator.json | 13 + .../apache/tika/pipes/solr/ConfigExamplesTest.java | 134 +++++++++ .../resources/config-examples/solr-emitter-zk.json | 15 ++ .../resources/config-examples/solr-emitter.json | 17 ++ .../resources/config-examples/solr-pipeline.json | 42 +++ .../config-examples/solr-pipes-iterator.json | 15 ++ .../test/resources/tika-config-simple-emitter.xml | 48 ---- tika-server/docker-build/CHANGES.md | 110 ++++++++ tika-server/docker-build/README.md | 288 ++++++++++++++++++++ .../docker-build/docker-compose-tika-customocr.yml | 39 +++ .../docker-build/docker-compose-tika-grobid.yml | 45 ++++ .../docker-build/docker-compose-tika-vision.yml | 62 +++++ tika-server/docker-build/docker-tool.sh | 87 +++++- tika-server/docker-build/full/Dockerfile | 59 ++-- tika-server/docker-build/full/Dockerfile.snapshot | 12 +- tika-server/docker-build/minimal/Dockerfile | 52 ++-- .../docker-build/minimal/Dockerfile.snapshot | 12 +- .../customocr/tika-config-inline.json | 11 + .../customocr/tika-config-inline.xml | 31 --- .../customocr/tika-config-rendered.json | 16 ++ .../customocr/tika-config-rendered.xml | 38 --- .../sample-configs/grobid/tika-config.json | 10 + .../sample-configs/grobid/tika-config.xml | 24 -- .../sample-configs/ner/run_tika_server.sh | 62 ----- .../sample-configs/ner/tika-config.xml | 28 -- .../vision/inception-rest-caption.xml | 32 --- .../sample-configs/vision/inception-rest-video.xml | 32 --- .../sample-configs/vision/inception-rest.xml | 32 --- .../sample-configs/vision/vlm-claude.json | 18 ++ .../sample-configs/vision/vlm-gemini.json | 17 ++ .../sample-configs/vision/vlm-openai.json | 19 ++ 184 files changed, 7158 insertions(+), 1739 deletions(-)
