This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4725-temporarily-disable-docker-rel in repository https://gitbox.apache.org/repos/asf/tika.git
commit c994c9aee165804401f9668de3263206cb443ee0 Author: tallison <[email protected]> AuthorDate: Mon May 11 16:57:28 2026 -0400 TIKA-4725 - temporarily disable automatic release, many updates for tika-server for parity with work in tika-docker repo today --- .github/workflows/docker-release.yml | 27 ++- .github/workflows/docker-snapshot.yml | 4 +- docs/modules/ROOT/nav.adoc | 1 + .../integration-testing/run-uat-script.adoc | 124 ++++++++++ .../pages/maintainers/release-guides/docker.adoc | 196 ++++++++++------ tika-server/docker-build/CHANGES.md | 74 ++++++ tika-server/docker-build/README.md | 254 +++++++++++++++++++++ .../docker-build/docker-compose-tika-customocr.yml | 39 ++++ .../docker-build/docker-compose-tika-grobid.yml | 45 ++++ .../docker-build/docker-compose-tika-vision.yml | 62 +++++ tika-server/docker-build/docker-tool.sh | 51 ++++- tika-server/docker-build/full/Dockerfile | 59 ++--- tika-server/docker-build/full/Dockerfile.snapshot | 12 +- tika-server/docker-build/minimal/Dockerfile | 52 +++-- .../docker-build/minimal/Dockerfile.snapshot | 12 +- .../customocr/tika-config-inline.json | 11 + .../customocr/tika-config-inline.xml | 31 --- .../customocr/tika-config-rendered.json | 16 ++ .../customocr/tika-config-rendered.xml | 38 --- .../sample-configs/grobid/tika-config.json | 10 + .../sample-configs/grobid/tika-config.xml | 24 -- .../sample-configs/ner/run_tika_server.sh | 62 ----- .../sample-configs/ner/tika-config.xml | 28 --- .../vision/inception-rest-caption.xml | 32 --- .../sample-configs/vision/inception-rest-video.xml | 32 --- .../sample-configs/vision/inception-rest.xml | 32 --- .../sample-configs/vision/vlm-claude.json | 18 ++ .../sample-configs/vision/vlm-gemini.json | 17 ++ .../sample-configs/vision/vlm-openai.json | 19 ++ 29 files changed, 968 insertions(+), 414 deletions(-) diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index dadf630887..54f18f5fd7 100644 --- a/.github/workflows/docker-release.yml +++ 
b/.github/workflows/docker-release.yml @@ -17,10 +17,17 @@ name: Docker release - tika-server and tika-grpc +# Auto-trigger on tag push is disabled (TIKA-4725). The official tika-docker +# images on Docker Hub (apache/tika) are published from the apache/tika-docker +# repository using its own Dockerfiles and tagging conventions. When this +# workflow ran on the 4.0.0-alpha-1 source tag it pushed an image built from +# the stale Dockerfiles under tika-server/docker-build/ to +# apache/tika:4.0.0-alpha-1, which collided with the tika-docker-managed tag +# and ran with the pre-4.x bare-jar entrypoint (broken plugin loading). Re-enable +# only after the in-repo Dockerfiles are kept in sync with (or replaced by a +# pointer to) apache/tika-docker. on: - push: - tags: - - '[0-9]+.[0-9]+.[0-9]+*' + workflow_dispatch: jobs: release-tika-server: @@ -52,25 +59,27 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: file: tika-server/docker-build/minimal/Dockerfile - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tag }} + # :latest is intentionally NOT pushed. It stays on 3.x (published from + # the external apache/tika-docker repo) until 4.0.0 GA, at which point + # add `apache/tika:latest` back here. tags: | apache/tika:${{ steps.version.outputs.tag }} - apache/tika:latest - name: Build and push tika-server full uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: file: tika-server/docker-build/full/Dockerfile - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tag }} + # :latest-full stays on 3.x until 4.0.0 GA; see note above. 
tags: | apache/tika:${{ steps.version.outputs.tag }}-full - apache/tika:latest-full release-tika-grpc: runs-on: ubuntu-latest @@ -152,6 +161,10 @@ jobs: push: true build-args: | VERSION=${{ steps.version.outputs.tag }} + # apache/tika-grpc is new in 4.x with no prior `:latest` to protect, so + # we track latest from the start. Unlike apache/tika (the server image) + # where :latest stays on 3.x until 4.0.0 GA, the grpc image has no 3.x + # incumbent. tags: | apache/tika-grpc:${{ steps.version.outputs.tag }} apache/tika-grpc:latest diff --git a/.github/workflows/docker-snapshot.yml b/.github/workflows/docker-snapshot.yml index cb82d05592..1b17355ad4 100644 --- a/.github/workflows/docker-snapshot.yml +++ b/.github/workflows/docker-snapshot.yml @@ -105,7 +105,7 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: context: target/tika-server-minimal-docker - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tika_version }} @@ -157,7 +157,7 @@ jobs: uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: context: target/tika-server-full-docker - platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x + platforms: linux/amd64,linux/arm64,linux/s390x push: true build-args: | TIKA_VERSION=${{ steps.version.outputs.tika_version }} diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 979555022a..b333e25fc6 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -55,6 +55,7 @@ ** xref:advanced/spooling.adoc[Spooling] ** xref:advanced/embedded-documents.adoc[Embedded Document Metadata] ** xref:advanced/local-vlm-server.adoc[Running a Local VLM Server] +** xref:advanced/integration-testing/run-uat-script.adoc[Tika-Server REST UAT Script] * xref:developers/index.adoc[Developers] ** xref:developers/serialization.adoc[Serialization and 
Configuration] * xref:faq.adoc[FAQ] diff --git a/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc new file mode 100644 index 0000000000..1e3365cd52 --- /dev/null +++ b/docs/modules/ROOT/pages/advanced/integration-testing/run-uat-script.adoc @@ -0,0 +1,124 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika-Server REST UAT Script + +A portable shell script that exercises the tika-server REST surface against an +already-running server. The same script is used as the docker image smoke +test, the e2e integration test, and as part of the source-release +verification. + +== Where it lives + +[source] +---- +release-tools/uat/ +├── run-uat.sh # the script +└── test-files/ + ├── testPDF.pdf + ├── testHTML.html + └── test_recursive_embedded.docx +---- + +== What it covers + +Roughly 25 REST endpoint checks across the default-mode endpoints, header +behavior, and error handling — the same surface enumerated in the manual +walkthrough at xref:advanced/integration-testing/tika-server.adoc[Tika-Server +Integration Testing], translated to bash + curl assertions. 
+ +Coverage includes: + +* `/version`, `/parsers`, `/detectors`, `/mime-types` (introspection) +* `/detect/stream` (mime detection) +* `/tika`, `/tika/text`, `/tika/xml`, `/tika/json` (parse) +* `/meta`, `/meta/{field}` (metadata) +* `/rmeta`, `/rmeta/text` (recursive metadata) +* `/unpack/all` (embedded extraction; verifies the response is a valid zip) +* `/language/stream` +* `/meta/form`, `/rmeta/form` (multipart variants) +* `enableUnsecureFeatures=false` gating: `/meta/config`, `/rmeta/config`, + `/tika/config` all return 403 +* `X-Tika-OCRskipOcr` header, `Content-Disposition` filename +* 404 / 405 error handling + +Two checks (T18d, T27) are currently disabled with inline comments pointing +at tika-core behavior anomalies that need fixing — re-enable them when those +land. + +== Running it + +The script takes a URL pointing at a running tika-server. It does *not* start +or stop the server itself. + +[source,bash] +---- +release-tools/uat/run-uat.sh [host] +# default host: http://localhost:9998 +---- + +Exit code: `0` on all-pass, `1` on any failure. Failed checks print the +expected pattern and a truncated response body. + +=== Against the unpacked bin.zip distribution + +[source,bash] +---- +unzip tika-server-standard-<VERSION>-bin.zip -d /tmp/tika-server-dist +cd /tmp/tika-server-dist +java -jar tika-server.jar -p 9998 -h localhost & +sleep 12 +~/path/to/tika/release-tools/uat/run-uat.sh +---- + +=== Against the Docker image + +The `docker-tool.sh test-uat` subcommand wraps starting the container, waiting +for `/version`, running the UAT, and stopping the container: + +[source,bash] +---- +cd tika-server/docker-build +./docker-tool.sh test-uat <DOCKER_VERSION> +---- + +=== As part of the e2e tests (CI) + +The Maven module `tika-e2e-tests/tika-server` unpacks the bin.zip, forks +`java -jar tika-server.jar`, and invokes this script via +`org.apache.tika.server.e2e.RunUatSmokeTest`. 
The CI workflow +`.github/workflows/main-jdk17-build.yml` runs this automatically on every PR +via `mvn -pl tika-e2e-tests -am clean verify -Pe2e`. + +== When to use it + +* *Pre-vote release verification.* Unpack + `tika-server-standard-<VERSION>-bin.zip` from `dist/dev` and run the UAT + against it. Catches packaging regressions before the vote thread starts. +* *Pre-publish docker verification.* Run via `docker-tool.sh test-uat` after + building a new image and before tagging it for release. +* *Local development sanity check.* When changing anything in + `tika-server-core` or the bin.zip assembly descriptor, run the UAT against + the build output to confirm you didn't regress endpoint behavior. +* *Adding new endpoints.* When a new REST endpoint lands, add a corresponding + check to the script so future regressions get caught. + +== Platform notes + +The script is bash + curl + unzip. It's skipped automatically on Windows by +the e2e test (no bash). On Linux/macOS it runs as-is. No external dependencies +beyond the standard tooling. diff --git a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc index a8f2f8cbc7..c699f00e4a 100644 --- a/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc +++ b/docs/modules/ROOT/pages/maintainers/release-guides/docker.adoc @@ -17,117 +17,179 @@ = Releasing Tika Docker Images -This guide covers the process for releasing Apache Tika Docker images. +This guide covers releasing the official Apache Tika Docker images +(`apache/tika` and `apache/tika-grpc` on Docker Hub). -== Prerequisites +== Where the Dockerfiles live -=== DockerHub Access +Starting with 4.0.0-alpha-1, the Dockerfiles and the GitHub Actions workflow +that publishes them live in this repository: -You need permissions on the `apache/tika` repository on DockerHub. To obtain access, -create an INFRA JIRA ticket with the "Docker" label. 
+* `tika-server/docker-build/{minimal,full}/Dockerfile` — `apache/tika` (server) release builds +* `tika-server/docker-build/{minimal,full}/Dockerfile.snapshot` — nightly snapshot builds +* `tika-grpc/docker-build/Dockerfile` — `apache/tika-grpc` release builds +* `.github/workflows/docker-release.yml` — the release publishing workflow +* `.github/workflows/docker-snapshot.yml` — the snapshot publishing workflow (auto on push to `main`) -=== Repository Access +NOTE: The legacy https://github.com/apache/tika-docker[apache/tika-docker] +repository is still used for 3.x patch releases — see <<3x-patches>> below. New +4.x work happens here. -Clone the tika-docker repository: +== Image types -[source,bash] ----- -git clone https://github.com/apache/tika-docker -cd tika-docker ----- +minimal:: +Apache Tika server with base dependencies (Java + the unpacked +`tika-server-standard-bin.zip`). -== Image Types +full:: +Adds Tesseract OCR, GDAL, ImageMagick, and Microsoft fonts. -The tika-docker repository produces two types of images: +`apache/tika-grpc`:: +The gRPC server packaged with parser-package jars and pipes plugin zips. -Minimal:: -Apache Tika with base dependencies (Java only) +== Prerequisites -Full:: -Apache Tika plus Tesseract OCR and GDAL +* You have committer permission on `apache/tika` (the GitHub repo). The Docker + release workflow is gated to maintainers via the standard repo permission + model — no separate Docker Hub credential is needed at trigger time; Docker + Hub auth is held by the workflow as a secret. +* The Tika release vote has passed and the artifacts have been moved from + `dist/dev` to `dist/release` (i.e., the bin.zip and parser-package jars are + already on `dlcdn.apache.org`/`downloads.apache.org`). The workflow + downloads those artifacts during the build, so they must be live first. +* The release tag (e.g. `4.0.0-alpha-1`) exists in the repo. `release:perform` + creates it during the upstream release. 
-== Helper Tools +== Release process -docker-tool.sh:: -Automates building, testing, and publishing Docker images +=== Step 1: Verify the upstream artifacts are live -republish-images.sh:: -Legacy script for batch republishing images +[source,bash] +---- +curl -sLI https://downloads.apache.org/tika/<TAG>/tika-server-standard-<TAG>-bin.zip \ + | head -1 +---- -NOTE: The repository also contains Docker Compose files for advanced scenarios -(Vision, Grobid, OCR, NER), but these are not used for official releases. +If you get a 200, you're ready. If 404, the SVN move from dist/dev to +dist/release hasn't propagated yet — wait a few minutes. -== Release Process +=== Step 2: Trigger the Docker release workflow -=== Step 1: Update README +The workflow is intentionally `workflow_dispatch`-only — it won't auto-fire on +tag push. Trigger it manually after the vote, against the release tag. -Update the "Available Tags" section in `README.md` to include the new version. +*Via the GitHub UI:* -=== Step 2: Update Version +. Open https://github.com/apache/tika/actions +. Select *Docker release - tika-server and tika-grpc* in the left sidebar +. Click *Run workflow* (top-right) +. Under *Use workflow from*, switch from the default branch to *Tags* and pick + the release tag (e.g. `4.0.0-alpha-1`) +. Click *Run workflow* -Increment the TAG version in the `.env` file. +*Via the `gh` CLI:* -=== Step 3: Update Changelog +[source,bash] +---- +gh workflow run docker-release.yml --ref <TAG> +# e.g. +gh workflow run docker-release.yml --ref 4.0.0-alpha-1 +---- -Update `CHANGES.md` with release information and date. +The `--ref` argument selects the git ref to check out. The workflow extracts +the tag name from `GITHUB_REF` (`refs/tags/<TAG>`) and uses it for both the +`TIKA_VERSION` build-arg and the published Docker tag. 
-=== Step 4: Test Locally +=== Step 3: Watch the run -Test the release locally before publishing: +A successful run takes ~30–45 minutes (multi-arch builds across `linux/amd64`, +`linux/arm64`, `linux/s390x` are slow under qemu emulation, especially the +full image). -[source,bash] ----- -./docker-tool.sh build <docker-version> <tika-version> -./docker-tool.sh test <docker-version> ----- +* GitHub UI: the Actions run page streams logs. +* CLI: `gh run watch` will tail the latest run. + +The workflow does three things: -=== Step 5: Commit Changes +. Builds and pushes `apache/tika:<TAG>` (minimal, multi-arch). +. Builds and pushes `apache/tika:<TAG>-full` (full, multi-arch). +. Builds and pushes `apache/tika-grpc:<TAG>` (multi-arch). -Commit all changes: +=== Step 4: Verify the published images [source,bash] ---- -git add README.md .env CHANGES.md -git commit -m "Prepare for Docker release <docker-version>" -git push +# Confirm the manifest landed: +curl -sL "https://hub.docker.com/v2/repositories/apache/tika/tags/<TAG>/" \ + | python3 -c "import sys,json;d=json.load(sys.stdin);print(d.get('tag_last_pushed'), d.get('digest'))" + +# Smoke-test the image locally: +docker pull apache/tika:<TAG> +docker run --rm -d --name tika-uat -p 127.0.0.1:9998:9998 apache/tika:<TAG> +sleep 12 +curl -s http://localhost:9998/version +docker rm -f tika-uat ---- -=== Step 6: Build and Publish - -Build and publish the images using the docker-tool script. - -Example for version 3.1.0.0 based on Tika 3.1.0: +For a deeper smoke test that exercises the full REST surface, run the +xref:advanced/integration-testing/run-uat-script.adoc[REST UAT script] +(the same one tied into the e2e tests): [source,bash] ---- -# Build the images -./docker-tool.sh build 3.1.0.0 3.1.0 +release-tools/uat/run-uat.sh http://localhost:9998 +---- -# Test the images -./docker-tool.sh test 3.1.0.0 +Both `apache/tika:<TAG>` and `apache/tika:<TAG>-full` should pass. 
-# Publish to DockerHub -./docker-tool.sh publish 3.1.0.0 3.1.0 ----- +== `:latest` tag policy + +The `apache/tika:latest` and `apache/tika:latest-full` tags currently still +point at the 3.x stable image (the `latest`-tagged 3.3.0 image published from +the external apache/tika-docker repo). -NOTE: Multi-architecture building takes time. The publish step automatically -updates the `-latest` tag on DockerHub. +The release workflow *deliberately does not push `:latest`* for 4.x +alpha/beta/RC builds — those tags stay on 3.x until 4.0.0 GA. When 4.0.0 GA +ships, edit `docker-release.yml` to re-add `apache/tika:latest` and +`apache/tika:latest-full` to the tag lists. -=== Step 7: Tag the Release +`apache/tika-grpc:latest` *is* pushed on every 4.x release — the grpc image is +new in 4.x and has no 3.x incumbent to protect. -Create and push a git tag for the release: +[[3x-patches]] +== 3.x patch releases (legacy path) + +Until 4.0.0 GA, any 3.x patch release (e.g. a 3.3.0.1 with a CVE fix) is +still published from the legacy https://github.com/apache/tika-docker[apache/tika-docker] +repository using its `docker-tool.sh`: [source,bash] ---- -git tag -a 3.1.0.0 -m "New release for 3.1.0.0" +git clone https://github.com/apache/tika-docker +cd tika-docker + +# Edit README.md (Available Tags), CHANGES.md, .env (TAG=...) +# Then commit + push + +./docker-tool.sh build <DOCKER_VERSION> <TIKA_VERSION> +./docker-tool.sh test <DOCKER_VERSION> +./docker-tool.sh publish <DOCKER_VERSION> <TIKA_VERSION> + +git tag -a <DOCKER_VERSION> -m "New release for <DOCKER_VERSION>" git push --tags ---- -== Post-Release +Use the 3.x convention `<TIKA_VERSION>.<DOCKER_BUILD_NUMBER>` (e.g. +`3.3.0.1` for the first Docker rebuild on top of Tika 3.3.0). 4.x releases +drop that scheme and publish bare `<TIKA_VERSION>` only. 
+ +== Post-release -After publishing the Docker images: +After the workflow completes: -* Verify the images are available on DockerHub at https://hub.docker.com/r/apache/tika -* Test pulling and running the new images -* Update the main Tika website if needed -* Proceed to release the link:helm.html[Helm charts] if applicable +* Verify both images on https://hub.docker.com/r/apache/tika and + https://hub.docker.com/r/apache/tika-grpc. +* Test pulling and running the new images from a clean machine. +* If applicable, proceed to xref:maintainers/release-guides/helm.adoc[release the Helm charts]. +* Update news/announcement copy on the main Tika website if it references the + Docker images. diff --git a/tika-server/docker-build/CHANGES.md b/tika-server/docker-build/CHANGES.md new file mode 100644 index 0000000000..eb6ce314a2 --- /dev/null +++ b/tika-server/docker-build/CHANGES.md @@ -0,0 +1,74 @@ +# Changes + +As of 2.5.0.1, we started adding a digit for Docker versions. Going forward, we'll include +a four digit version, where the first three are the Tika version and the last one is the docker version. +As of 2.5.0.2, we started tagging release commits in our github repo. + +* 4.0.0-alpha-1.0 (9 May 2026) + * First 4.0.0-alpha-1 release (preview; not tagged `latest`) + * Dropped `linux/arm/v7` from the published platforms. 32-bit ARM emulated + builds on Ubuntu 26.04 (resolute) hit a qemu chown-overflow in + `update-notifier-common`'s postinst, which is pulled in by + `ttf-mscorefonts-installer`. `linux/arm64/v8` covers modern ARM. 
+ +* 3.3.0.0 (23 Mar 2026) + * First 3.3.0 release + +* 3.2.3.0 (15 Sep 2025) + * First 3.2.3 release + +* 3.2.2.0 (8 Aug 2025) + * First 3.2.2 release + +* 3.2.1.0 (9 Jul 2025) + * First 3.2.1 release + +* 3.2.0.0 (2 Jun 2025) + * First 3.2.0 release + * Update base to plucky + * Add Japanese language pack for tesseract + * Add ImageMagick + +* 3.1.0.0 (31 Jan 2025) + * First 3.1.0 release + * Update base to oracular + +* 3.0.0.0 (21 Oct 2024) + * First 3.x stable release + * Bump jre to 21 + +* 2.9.2.1 (21 May 2024) + * Updated to noble + * First multi-arch release + +* 2.9.2.0 (10 October 2023) + * Initial release for Tika 2.9.2 + +* 2.9.1.0 (10 October 2023) + * Initial release for Tika 2.9.1 + +* 2.9.0.0 (28 August 2023) + * Initial release for Tika 2.9.0 + +* 2.8.0.0 (15 May 2023) + * Initial release for Tika 2.8.0 + + +* 2.7.0.1 (27 March 2023) + * More efficient build process and final image size via @stumpylog on [pr#17](https://github.com/apache/tika-docker/pull/17). + +* 2.7.0.0 (6 Feb 2023) + * Initial release for Tika 2.7.0 + +* 2.6.0.1 (10 November 2022) + * Update operating system against OpenSSL CVE (TIKA-3926). + +* 2.6.0.0 (7 November 2022) + * Initial release for Tika 2.6.0 + +* 2.5.0.2 (31 October 2022) + * Fixed root-user regression caused by differences in Docker behavior based on the build system's OS (TIKA-3912) + * Added tika-extras/ directory to pick up extra jars via mounted drive or for those using our image as a base image (TIKA-3907) + +* 2.5.0.1 (27 October 2022) + * Update to latest jammy to avoid recent CVEs (TIKA-3906) \ No newline at end of file diff --git a/tika-server/docker-build/README.md b/tika-server/docker-build/README.md new file mode 100644 index 0000000000..05b874a075 --- /dev/null +++ b/tika-server/docker-build/README.md @@ -0,0 +1,254 @@ +# tika-docker <!--- update this once we migrate to github actions(?) 
[](https://travis-ci.com/github/apache/tika-docker) --> + +This repo is used to create convenience Docker images for Apache Tika Server published as [apache/tika](https://hub.docker.com/r/apache/tika) on DockerHub by the [Apache Tika](http://tika.apache.org) Dev team + +The images create a functional Apache Tika Server instance that contains the latest Ubuntu running the appropriate version's server on Port 9998 using Java 8 (until version 1.20), Java 11 (1.21 and 1.24.1), Java 14 (until 1.27/2.0.0), Java 16 (for 2.1.0), and Java 17 LTS for newer versions. + +There is a minimal version, which contains only Apache Tika and its core dependencies, and a full version, which also includes dependencies for the GDAL and Tesseract OCR parsers. To balance showing functionality versus the size of the full image, this file by default installs the language packs for the following languages: +* English +* French +* German +* Italian +* Spanish +* Japanese + +To install more languages, set the build argument `LANGUAGES` or include your own custom packs using an ADD command. + +## Available Tags + +Below are the most recent tags. The `latest` tags track the 3.x stable line; +4.x preview releases are published as version-specific tags only. 
+- `latest`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) +- `latest-full`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) +- `4.0.0-alpha-1.0`: Apache Tika Server 4.0.0-alpha-1.0 (Minimal, 4.x preview) +- `4.0.0-alpha-1.0-full`: Apache Tika Server 4.0.0-alpha-1.0 (Full, 4.x preview) +- `3.3.0.0`, `3.3.0.0`: Apache Tika Server 3.3.0.0 (Minimal) +- `3.3.0.0`, `3.3.0.0-full`: Apache Tika Server 3.3.0.0 (Full) +- `3.2.3.0`, `3.2.3.0`: Apache Tika Server 3.2.3.0 (Minimal) +- `3.2.3.0`, `3.2.3.0-full`: Apache Tika Server 3.2.3.0 (Full) +- `3.2.2.0`, `3.2.2.0`: Apache Tika Server 3.2.2.0 (Minimal) +- `3.2.2.0`, `3.2.2.0-full`: Apache Tika Server 3.2.2.0 (Full) +- `3.2.1.0`, `3.2.1.0`: Apache Tika Server 3.2.1.0 (Minimal) +- `3.2.1.0`, `3.2.1.0-full`: Apache Tika Server 3.2.1.0 (Full) +- `3.2.0.0`, `3.2.0.0`: Apache Tika Server 3.2.0.0 (Minimal) +- `3.2.0.0`, `3.2.0.0-full`: Apache Tika Server 3.2.0.0 (Full) +- `3.1.0.0`, `3.1.0.0`: Apache Tika Server 3.1.0.0 (Minimal) +- `3.1.0.0`, `3.1.0.0-full`: Apache Tika Server 3.1.0.0 (Full) +- `3.0.0.0`, `3.0.0.0`: Apache Tika Server 3.0.0.0 (Minimal) +- `3.0.0.0`, `3.0.0.0-full`: Apache Tika Server 3.0.0.0 (Full) +- `3.0.0.0-BETA2`, `3.0.0.0-BETA2`: Apache Tika Server 3.0.0.0-BETA2 (Minimal) +- `3.0.0.0-BETA2`, `3.0.0.0-BETA2-full`: Apache Tika Server 3.0.0.0-BETA2 (Full) +- `2.9.2.1`, `2.9.2.1`: Apache Tika Server 2.9.2.1 (Minimal) +- `2.9.2.1`, `2.9.2.1-full`: Apache Tika Server 2.9.2.1 (Full) +- `2.9.2.0`, `2.9.2.0`: Apache Tika Server 2.9.2.0 (Minimal) +- `2.9.2.0`, `2.9.2.0-full`: Apache Tika Server 2.9.2.0 (Full) +- `2.9.1.0`, `2.9.1.0`: Apache Tika Server 2.9.1.0 (Minimal) +- `2.9.1.0`, `2.9.1.0-full`: Apache Tika Server 2.9.1.0 (Full) +- `2.9.0.0`, `2.9.0.0`: Apache Tika Server 2.9.0.0 (Minimal) +- `2.9.0.0`, `2.9.0.0-full`: Apache Tika Server 2.9.0.0 (Full) +- `2.8.0.0`, `2.8.0.0`: Apache Tika Server 2.8.0.0 (Minimal) +- `2.8.0.0`, `2.8.0.0-full`: Apache Tika Server 2.8.0.0 (Full) +- `2.7.0.1`, `2.7.0.1`: Apache 
Tika Server 2.7.0.1 (Minimal) +- `2.7.0.1`, `2.7.0.1-full`: Apache Tika Server 2.7.0.1 (Full) +- `2.7.0.0`, `2.7.0.0`: Apache Tika Server 2.7.0.0 (Minimal) +- `2.7.0.0`, `2.7.0.0-full`: Apache Tika Server 2.7.0.0 (Full) +- `2.6.0.1`: Apache Tika Server 2.6.0.1 (Minimal) +- `2.6.0.1-full`: Apache Tika Server 2.6.0.1 (Full) +- `2.6.0.0`: Apache Tika Server 2.6.0.0 (Minimal) +- `2.6.0.0-full`: Apache Tika Server 2.6.0.0 (Full) +- `2.5.0.2`: Apache Tika Server 2.5.0.2 (Minimal) +- `2.5.0.2-full`: Apache Tika Server 2.5.0.2 (Full) +- `2.5.0.1`: Apache Tika Server 2.5.0.1 (Minimal) +- `2.5.0.1-full`: Apache Tika Server 2.5.0.1 (Full) +- `2.5.0`: Apache Tika Server 2.5.0 (Minimal) +- `2.5.0-full`: Apache Tika Server 2.5.0 (Full) +- `2.4.1`: Apache Tika Server 2.4.1 (Minimal) +- `2.4.1-full`: Apache Tika Server 2.4.1 (Full) +- `2.4.0`: Apache Tika Server 2.4.0 (Minimal) +- `2.4.0-full`: Apache Tika Server 2.4.0 (Full) +- `2.3.0`: Apache Tika Server 2.3.0 (Minimal) +- `2.3.0-full`: Apache Tika Server 2.3.0 (Full) +- `2.2.1`: Apache Tika Server 2.2.1 (Minimal) +- `2.2.1-full`: Apache Tika Server 2.2.1 (Full) + +Below are the most recent 1.x series tags. **Note** that as of 30 September 2022, the 1.x branch is no longer supported. + +- `1.28.5`: Apache Tika Server 1.28.5 (Minimal) +- `1.28.5-full`: Apache Tika Server 1.28.5 (Full) +- `1.28.4`: Apache Tika Server 1.28.4 (Minimal) +- `1.28.4-full`: Apache Tika Server 1.28.4 (Full) +- `1.28.3`: Apache Tika Server 1.28.3 (Minimal) +- `1.28.3-full`: Apache Tika Server 1.28.3 (Full) +- `1.28.2`: Apache Tika Server 1.28.2 (Minimal) +- `1.28.2-full`: Apache Tika Server 1.28.2 (Full) +- `1.28.1`: Apache Tika Server 1.28.1 (Minimal) +- `1.28.1-full`: Apache Tika Server 1.28.1 (Full) + +You can see a full set of tags for historical versions [here](https://hub.docker.com/r/apache/tika/tags?page=1&ordering=last_updated). 
+ +## 4.x Preview Notes + +The `4.0.0-alpha-1.0` images are a preview of the upcoming Tika 4.x line and are +not tagged `latest`. + +Tika 4.x changed the `tika-server-standard` packaging: the published jar is now +a thin top-level jar that resolves its dependencies from a sibling `lib/` +directory. The 4.x image therefore ships the unpacked `tika-server-standard-bin.zip` +distribution under `/opt/tika-server/` (containing `tika-server.jar`, `lib/`, +and `plugins/`) instead of a single fat jar. + +The standard REST endpoints (`/tika`, `/rmeta`, `/unpack`, `/detect`, etc.) +work as in 3.x — they spool the request body to a temp file internally via +`TikaInputStream` and do not require any pipes plugin. + +Pipes-mode endpoints (`/pipes`, `/async`) require pf4j plugins. The +`tika-pipes-file-system` plugin is **bundled** under +`/opt/tika-server/plugins/tika-pipes-file-system/` (it ships inside the +upstream `tika-server-standard-bin.zip`). Other pipes plugins +(`tika-pipes-http`, `tika-pipes-s3`, etc.) are not currently bundled in the +preview image; mount them into `/opt/tika-server/plugins/` if you need them. +Bundling additional common plugins is planned for `4.0.0-beta-1.0`. + +## Supported Platforms + +The Docker images are published as multi-platform images supporting the following architectures: + +- `linux/amd64` - 64-bit x86 processors (Intel/AMD) +- `linux/arm64/v8` - 64-bit ARM processors (Apple Silicon, AWS Graviton, etc.) +- `linux/s390x` - IBM System z mainframes + +NOTE: `linux/arm/v7` was published for 3.x but dropped starting with `4.0.0-alpha-1.0`. +If you need 32-bit ARM, pin to a 3.x tag. The drop was driven by a qemu/dpkg +emulation bug that broke font-package installation on the Ubuntu 26.04 base. + +Docker will automatically pull the correct image for your platform when you use `docker pull` or `docker run`. 
+ +## Usage + +### Default + +You can pull down the version you would like using: + + docker pull apache/tika:<tag> + +Then to run the container, execute the following command: + + docker run -d -p 127.0.0.1:9998:9998 apache/tika:<tag> + +Where <tag> is the DockerHub tag corresponding to the Apache Tika Server version - e.g. 1.23, 1.22, 1.23-full, 1.22-full. + +NOTE: The latest and latest-full tags are explicitly set to the latest stable release (currently the 3.x line) when they are published. + +NOTE: In the example above, we recommend binding the server to localhost because Docker alters iptables and may expose +your tika-server to the internet. If you are confident that your tika-server is on an isolated network +you can simply run: + + docker run -d -p 9998:9998 apache/tika:<tag> + +### Custom Config + +From version 1.25 and 1.25-full of the image it is now easier to override the defaults and pass parameters to the running instance. + +So for example if you wish to disable the OCR parser in the full image you could write a custom configuration: + +``` +cat <<EOT >> tika-config.json +{ + "parsers": [ + { "default-parser": {} }, + { "tesseract-ocr-parser": { "skipOcr": true } } + ] +} +EOT +``` +Then by mounting this custom configuration as a volume, you could pass the command line parameter to load it + + docker run -d -p 127.0.0.1:9998:9998 -v `pwd`/tika-config.json:/tika-config.json apache/tika:<tag>-full -c /tika-config.json + +NOTE: Tika 4.x replaced the XML `tika-config.xml` format with JSON +`tika-config.json` (see TIKA-4544). The JSON example above is for 4.x images; +2.x / 3.x images expect the XML format, so keep using XML if you're pinned to those tags. + +You can see more configuration examples on the +[Tika website](https://tika.apache.org/) and in the canonical samples under +`tika-server/tika-server-core/src/test/resources/config-examples/` in the +source tree. 
+ +As of 2.5.0.2, if you'd like to add extra jars from your local `my-jars` directory to Tika's classpath, mount to `/tika-extras` like so: + + docker run -d -p 127.0.0.1:9998:9998 -v `pwd`/my-jars:/tika-extras apache/tika:2.5.0.2-full + +You may want to do this to add optional components, such as the tika-eval metadata filter, or optional +dependencies such as jai-imageio-jpeg2000 (check license compatibility first!). + +### Docker Compose Examples + +There are a number of sample Docker Compose files included in the repo to allow you to test some different scenarios. + +These files use docker-compose 3.x series and include: + +* docker-compose-tika-vision.yml - Vision-Language Model parsing example (OpenAI-compatible / Claude / Gemini) +* docker-compose-tika-grobid.yml - Grobid REST parsing example +* docker-compose-tika-customocr.yml - Tesseract OCR example with custom configuration + +The Docker Compose files and configurations (sourced from _sample-configs_ directory) all have comments in them so you can try different options, or use them as a base to create your own custom configuration. + +**N.B.** You will want to create an environment variable (used in some bash scripts) matching the version of tika-docker you want to work with in the docker compositions e.g. `export TAG=1.26`. Similarly you should also consult `.env` which is used in the docker-compose `.yml` files. + +You can install docker-compose from [here](https://docs.docker.com/compose/install/). 
+ +## Building + +To build the image from scratch, simply invoke: + + docker build -t 'apache/tika' github.com/apache/tika-docker + +You can then use the following command (using the name you allocated in the build command as part of -t option): + + docker run -d -p 127.0.0.1:9998:9998 apache/tika + +## More Information + +For more information on Apache Tika Server, go to the [Apache Tika Server documentation](https://cwiki.apache.org/confluence/display/TIKA/TikaServer). 
+ +For more information on Apache Tika, go to the official [Apache Tika](http://tika.apache.org) project website. + +To meet up with others using Apache Tika, consider coming to one of the [Apache Tika Virtual Meetups](https://www.meetup.com/apache-tika-community/). + +For more information on the Apache Software Foundation, go to the [Apache Software Foundation](http://apache.org) website. + +For a full list of changes as of 2.5.0.1, visit [CHANGES.md](CHANGES.md). + +For our current release process, visit [tika-docker Release Process](https://cwiki.apache.org/confluence/display/TIKA/Release+Process+for+tika-docker) + +## Authors + +Apache Tika Dev Team ([email protected]) + +## Contributors + +There have been a range of [contributors](https://github.com/apache/tika-docker/graphs/contributors) on GitHub and via suggestions, including: + +- [@grossws](https://github.com/grossws) +- [@arjunyel](https://github.com/arjunyel) +- [@mpdude](https://github.com/mpdude) +- [@laszlocsontosuw](https://github.com/laszlocsontosuw) +- [@tallisonapache](https://github.com/tballison) + +## License + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +## Disclaimer + +It is worth noting that whilst these Docker images download the binary JARs published by the Apache Tika Team on the Apache Software Foundation distribution sites, only the source release of an Apache Software Foundation project is an official release artefact. 
See [Release Distribution Policy](https://www.apache.org/dev/release-distribution.html) for more details. diff --git a/tika-server/docker-build/docker-compose-tika-customocr.yml b/tika-server/docker-build/docker-compose-tika-customocr.yml new file mode 100644 index 0000000000..29cf667a21 --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-customocr.yml @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.8" +services: + + ## Apache Tika Server + tika: + image: apache/tika:${TAG}-full + # Override default so we can add the /customocr dir on the classpath + # (for the bundled TesseractOCRConfig.properties). The 4.x image layout + # places the thin server jar at /opt/tika-server/tika-server.jar and its + # deps at /opt/tika-server/lib/*. working_dir=/opt/tika-server matters for + # tika-server's plugin-roots fallback (see TikaServerProcess#resolveDefaultPluginsDir). 
+ entrypoint: [ "/bin/sh", "-c", "exec java -cp \"/customocr:/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $$0 $$@"] + working_dir: /opt/tika-server + # Kept command as example but could be added to entrypoint too + command: -c /tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + # Choose the configuration you want, or add your own custom one + # - ./sample-configs/customocr/tika-config-inline.json:/tika-config.json + - ./sample-configs/customocr/tika-config-rendered.json:/tika-config.json + + diff --git a/tika-server/docker-build/docker-compose-tika-grobid.yml b/tika-server/docker-build/docker-compose-tika-grobid.yml new file mode 100644 index 0000000000..add5d2744f --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-grobid.yml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.8" +services: + + ## Apache Tika Server + tika: + image: apache/tika:${TAG}-full + # Override default so we can add the /grobid dir on the classpath + # (for the bundled GrobidExtractor.properties). 
The 4.x image layout + # places the thin server jar at /opt/tika-server/tika-server.jar and its + # deps at /opt/tika-server/lib/*. working_dir=/opt/tika-server matters for + # tika-server's plugin-roots fallback. + entrypoint: [ "/bin/sh", "-c", "exec java -cp \"/grobid:/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $$0 $$@"] + working_dir: /opt/tika-server + # Kept command as example but could be added to entrypoint too + command: -c /grobid/tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + - ./sample-configs/grobid:/grobid + depends_on: + - grobid + + ## Grobid Service + grobid: + image: lfoppiano/grobid:0.6.1 + ports: + - "8070:8070" + - "8071:8071" + diff --git a/tika-server/docker-build/docker-compose-tika-vision.yml b/tika-server/docker-build/docker-compose-tika-vision.yml new file mode 100644 index 0000000000..da01d03a27 --- /dev/null +++ b/tika-server/docker-build/docker-compose-tika-vision.yml @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Vision-Language Model parsing for tika-server (Tika 4.x). 
+# +# The pre-4.x inception-rest / Im2txt / inception-video services and the +# org.apache.tika.parser.recognition.ObjectRecognitionParser they served +# have been removed (TIKA-4499 / TIKA-4500). The 4.x replacement is a +# family of VLM parsers (OpenAI-compatible, Anthropic Claude, Google +# Gemini). See: +# +# docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc +# +# This compose demonstrates the OpenAI-compatible variant pointing at a +# locally-hosted Ollama instance. To use a different VLM: +# - Swap the mounted tika-config.* for vlm-claude.json or vlm-gemini.json +# and pass the relevant API key via env (ANTHROPIC_API_KEY / +# GEMINI_API_KEY). +# - Drop the vlm-server service block below. + +services: + + ## Apache Tika Server + tika: + image: apache/tika:latest-full + command: -c /tika-config.json + restart: on-failure + ports: + - "9998:9998" + volumes: + - ./sample-configs/vision/vlm-openai.json:/tika-config.json + # - ./sample-configs/vision/vlm-claude.json:/tika-config.json + # - ./sample-configs/vision/vlm-gemini.json:/tika-config.json + depends_on: + - vlm-server + + ## Local OpenAI-compatible VLM endpoint. + ## Replace with vLLM, your own FastAPI wrapper, or remove and point + ## baseUrl in vlm-openai.json at OpenAI's real API. + vlm-server: + image: ollama/ollama:latest + ports: + - "8000:11434" + # Volumes for pulled models. Uncomment and pull a vision-capable model + # (e.g. `docker exec <container> ollama pull llava`) before first use. + # volumes: + # - ollama-models:/root/.ollama + +# volumes: +# ollama-models: diff --git a/tika-server/docker-build/docker-tool.sh b/tika-server/docker-build/docker-tool.sh index 2a82b5fa34..db05dddf2e 100755 --- a/tika-server/docker-build/docker-tool.sh +++ b/tika-server/docker-build/docker-tool.sh @@ -36,6 +36,8 @@ while getopts ":h" opt; do echo " docker-tool.sh -h Display this help message." 
echo " docker-tool.sh build <TIKA_DOCKER_VERSION> <TIKA_VERSION> Builds <TIKA_DOCKER_VERSION> images for <TIKA_VERSION>." echo " docker-tool.sh test <TIKA_DOCKER_VERSION> Tests images for <TIKA_DOCKER_VERSION>." + echo " docker-tool.sh test-uat <TIKA_DOCKER_VERSION> Runs the tika-server REST UAT against images for <TIKA_DOCKER_VERSION>." + echo " Requires an executable release-tools/uat/run-uat.sh in this tika checkout." echo " docker-tool.sh publish <TIKA_DOCKER_VERSION> <TIKA_VERSION> Builds multi-arch images for <TIKA_DOCKER_VERSION> and pushes to Docker Hub." exit 0 ;; @@ -98,6 +100,35 @@ test_docker_image() { stop_test_container "$container_name" } +test_docker_image_uat() { + container_name=$1 + image=$image_name:$1 + uat_script=$2 + + docker run -d --name "$container_name" -p 127.0.0.1:9998:9998 "$image" \ + || die "couldn't start $image" + + # Poll /version up to 30 times (2s curl timeout + 1s sleep each); the UAT runs afterwards either way. + for i in $(seq 1 30); do + if curl -fsS --max-time 2 http://localhost:9998/version >/dev/null 2>&1; then + break + fi + sleep 1 + done + + if "$uat_script" http://localhost:9998; then + echo "$(tput setaf 2)Image: $image - UAT passed$(tput sgr0)" + stop_test_container "$container_name" + else + echo "$(tput setaf 1)Image: $image - UAT failed$(tput sgr0)" + echo "--- last 40 lines of container log ---" + docker logs --tail 40 "$container_name" || true + echo "--- end log ---" + stop_test_container "$container_name" + exit 1 + fi +} + shift $((OPTIND -1)) subcommand=$1; shift tika_docker_version=$1; shift @@ -118,13 +149,25 @@ case "$subcommand" in test_docker_image "${tika_docker_version}-full" true ;; + test-uat) + # Run the tika-server REST UAT (release-tools/uat/run-uat.sh, two levels + # up from this script in the tika repo) against both images. + repo_root="$(cd "$(dirname "$0")/../.." && pwd)" + uat_script="${repo_root}/release-tools/uat/run-uat.sh" + if [[ ! 
-x "$uat_script" ]]; then + die "UAT script not found or not executable: $uat_script" + fi + test_docker_image_uat "${tika_docker_version}" "$uat_script" + test_docker_image_uat "${tika_docker_version}-full" "$uat_script" + ;; + publish) docker buildx create --use --name tika-builder || die "couldn't create builder" # Build multi-arch with buildx and push - docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:latest --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" - docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ - --tag ${image_name}:latest-full --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" + docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:${tika_docker_version} --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal" + docker buildx build --platform linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \ + --tag ${image_name}:${tika_docker_version}-full --build-arg TIKA_VERSION=${tika_version} --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full" docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! 
" ;; diff --git a/tika-server/docker-build/full/Dockerfile b/tika-server/docker-build/full/Dockerfile index 1b918390f6..7c77e4a048 100644 --- a/tika-server/docker-build/full/Dockerfile +++ b/tika-server/docker-build/full/Dockerfile @@ -15,36 +15,42 @@ # the subsequent stages -- see TIKA-3912 ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS base +FROM ubuntu:resolute AS base FROM base AS fetch_tika ARG TIKA_VERSION ARG CHECK_SIG=true -ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ - ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ +ENV TIKA_SERVER_ARCHIVE="tika-server-standard-${TIKA_VERSION}-bin.zip" \ + NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ TIKA_VERSION=$TIKA_VERSION -RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 wget ca-certificates \ +# 4.x publishes tika-server as a bin.zip distribution. 
The thin top-level +# tika-server.jar uses its manifest Class-Path to resolve the jars under lib/, +# and tika-server reads pf4j plugins from the plugins/ directory next to it. +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install gnupg2 wget ca-certificates unzip \ && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ - && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ - && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ - && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar - -#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm 
/${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $BACKUP_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || exit 1 \ + && gpg --verify /${TIKA_SERVER_ARCHIVE}.asc /${TIKA_SERVER_ARCHIVE} \ + && mkdir -p /opt/tika-server \ + && unzip -q /${TIKA_SERVER_ARCHIVE} -d /opt/tika-server \ + && rm /${TIKA_SERVER_ARCHIVE} /${TIKA_SERVER_ARCHIVE}.asc FROM base AS runtime ARG UID_GID -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' +ARG LANGUAGES='eng ita fra spa deu jpn' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ @@ -53,12 +59,7 @@ RUN set -eux \ gdal-bin \ imagemagick \ tesseract-ocr \ - tesseract-ocr-eng \ - tesseract-ocr-ita \ - tesseract-ocr-fra \ - tesseract-ocr-spa \ - tesseract-ocr-deu \ - tesseract-ocr-jpn \ + $(printf 'tesseract-ocr-%s ' $LANGUAGES) \ && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ xfonts-utils \ @@ -72,11 +73,17 @@ RUN set -eux \ ARG TIKA_VERSION ENV TIKA_VERSION=$TIKA_VERSION -COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY --from=fetch_tika /opt/tika-server /opt/tika-server +# WORKDIR sets the CWD so tika-server's plugin-root fallback resolves +# `plugins/` relative to /opt/tika-server (its `getCodeSource()` returns a +# lib/* path, not the top-level jar, so the "next-to-jar" resolution 
misses). +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +# Classpath includes the thin server jar, its lib/ deps, and any user-mounted /tika-extras/. +# tika-server auto-discovers pf4j plugins from /opt/tika-server/plugins/. +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/full/Dockerfile.snapshot b/tika-server/docker-build/full/Dockerfile.snapshot index 4f655005e6..03bcc08e41 100644 --- a/tika-server/docker-build/full/Dockerfile.snapshot +++ b/tika-server/docker-build/full/Dockerfile.snapshot @@ -15,10 +15,10 @@ ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS runtime +FROM ubuntu:resolute AS runtime ARG UID_GID ARG TIKA_VERSION -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \ @@ -44,9 +44,13 @@ RUN set -eux \ && apt-get clean -y \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server/ /tika-server/ +# Snapshot workflow tars the bin distribution into <context>/tika-server/, so +# this COPY lands the thin jar + lib/ + plugins/ at /opt/tika-server/, matching +# the release-variant Dockerfile. 
+COPY tika-server/ /opt/tika-server/ +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/minimal/Dockerfile b/tika-server/docker-build/minimal/Dockerfile index 1c5195920a..af641a491d 100644 --- a/tika-server/docker-build/minimal/Dockerfile +++ b/tika-server/docker-build/minimal/Dockerfile @@ -16,44 +16,48 @@ # the subsequent stages -- see TIKA-3912 ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS base +FROM ubuntu:resolute AS base FROM base AS fetch_tika ARG TIKA_VERSION ARG CHECK_SIG=true -ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ - DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ - ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ +ENV TIKA_SERVER_ARCHIVE="tika-server-standard-${TIKA_VERSION}-bin.zip" \ + NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + 
BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}-bin.zip.asc" \ TIKA_VERSION=$TIKA_VERSION +# 4.x publishes tika-server as a bin.zip distribution. The thin top-level +# tika-server.jar uses its manifest Class-Path to resolve the jars under lib/, +# and tika-server reads pf4j plugins from the plugins/ directory next to it. RUN set -eux \ && apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ gnupg2 \ wget \ ca-certificates \ + unzip \ && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ - && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ - && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ - && sh -c "[ -f 
/tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ - && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar - -# this used to work, but I'm getting "ERROR: failed to solve: failed to prepare $data as $data2: invalid argument" -# when trying to build 2.9.2.0 -#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || wget $BACKUP_TIKA_SERVER_URL -O /${TIKA_SERVER_ARCHIVE} || rm /${TIKA_SERVER_ARCHIVE} \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE} ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /${TIKA_SERVER_ARCHIVE}.asc || rm /${TIKA_SERVER_ARCHIVE}.asc \ + && sh -c "[ -f /${TIKA_SERVER_ARCHIVE}.asc ]" || exit 1 \ + && gpg --verify /${TIKA_SERVER_ARCHIVE}.asc /${TIKA_SERVER_ARCHIVE} \ + && mkdir -p /opt/tika-server \ + && unzip -q /${TIKA_SERVER_ARCHIVE} -d /opt/tika-server \ + && rm /${TIKA_SERVER_ARCHIVE} /${TIKA_SERVER_ARCHIVE}.asc FROM base AS runtime # must reference uid_gid ARG UID_GID -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends \ @@ -62,9 +66,15 @@ RUN set -eux \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ARG TIKA_VERSION ENV TIKA_VERSION=$TIKA_VERSION -COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +COPY 
--from=fetch_tika /opt/tika-server /opt/tika-server +# WORKDIR sets the CWD so tika-server's plugin-root fallback resolves +# `plugins/` relative to /opt/tika-server (its `getCodeSource()` returns a +# lib/* path, not the top-level jar, so the "next-to-jar" resolution misses). +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +# Classpath includes the thin server jar, its lib/ deps, and any user-mounted /tika-extras/. +# tika-server auto-discovers pf4j plugins from /opt/tika-server/plugins/. +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/minimal/Dockerfile.snapshot b/tika-server/docker-build/minimal/Dockerfile.snapshot index d701dfee68..873ee64f56 100644 --- a/tika-server/docker-build/minimal/Dockerfile.snapshot +++ b/tika-server/docker-build/minimal/Dockerfile.snapshot @@ -15,10 +15,10 @@ ARG UID_GID="35002:35002" -FROM ubuntu:plucky AS runtime +FROM ubuntu:resolute AS runtime ARG UID_GID ARG TIKA_VERSION -ARG JRE='openjdk-21-jre-headless' +ARG JRE='openjdk-25-jre-headless' RUN set -eux \ && apt-get update \ && apt-get install --yes --no-install-recommends \ @@ -26,9 +26,13 @@ RUN set -eux \ ca-certificates \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ENV TIKA_VERSION=$TIKA_VERSION -COPY tika-server/ /tika-server/ +# Snapshot workflow tars the bin distribution into <context>/tika-server/, so +# this COPY lands the thin jar + lib/ + plugins/ at /opt/tika-server/, matching +# the release-variant Dockerfile. 
+COPY tika-server/ /opt/tika-server/ +WORKDIR /opt/tika-server USER $UID_GID EXPOSE 9998 -ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/opt/tika-server/tika-server.jar:/opt/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] LABEL maintainer="Apache Tika Developers [email protected]" diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json new file mode 100644 index 0000000000..055e72c9ba --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.json @@ -0,0 +1,11 @@ +{ + "_comment": "Extract inline images from PDF and OCR them with Tesseract.", + "parsers": [ + { "tesseract-ocr-parser": {} }, + { + "pdf-parser": { + "extractInlineImages": true + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml deleted file mode 100644 index 1c9b613033..0000000000 --- a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml +++ /dev/null @@ -1,31 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. 
You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. - --> -<properties> - <parsers> - <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) --> - <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - - <!-- Extract and OCR Inline Images in PDF --> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractInlineImages" type="bool">true</param> - </params> - </parser> - - </parsers> -</properties> diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json new file mode 100644 index 0000000000..45f3d3bf72 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.json @@ -0,0 +1,16 @@ +{ + "_comment": [ + "Render each PDF page as an image and run Tesseract on it.", + "ocrStrategy options: no_ocr, ocr_only, ocr_and_text, auto." + ], + "parsers": [ + { "tesseract-ocr-parser": {} }, + { + "pdf-parser": { + "ocrStrategy": "ocr_only", + "ocrImageType": "rgb", + "ocrDPI": 100 + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml deleted file mode 100644 index bcd8666996..0000000000 --- a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml +++ /dev/null @@ -1,38 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. 
See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. - --> -<properties> - <parsers> - <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) --> - <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/> - - <!-- OCR on Rendered Pages --> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <!-- no_ocr - extract text only - ocr_only - don't extract text and just attempt OCR - ocr_and_text - extract text and attempt OCR (from Tika 1.24) - auto - extract text but if < 10 characters try OCR - --> - <param name="ocrStrategy" type="string">ocr_only</param> - <param name="ocrImageType" type="string">rgb</param> - <param name="ocrDPI" type="int">100</param> - </params> - </parser> - - </parsers> -</properties> diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.json b/tika-server/docker-build/sample-configs/grobid/tika-config.json new file mode 100644 index 0000000000..943ec19528 --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/tika-config.json @@ -0,0 +1,10 @@ +{ + "_comment": "Route PDFs through GROBID (via JournalParser) for journal-article extraction.", + "parsers": [ + { + "journal-parser": { + "_mime-include": ["application/pdf"] + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.xml 
b/tika-server/docker-build/sample-configs/grobid/tika-config.xml deleted file mode 100644 index 5b4aad9c72..0000000000 --- a/tika-server/docker-build/sample-configs/grobid/tika-config.xml +++ /dev/null @@ -1,24 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. - --> -<properties> - <parsers> - <parser class="org.apache.tika.parser.journal.JournalParser"> - <mime>application/pdf</mime> - </parser> - </parsers> -</properties> diff --git a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh deleted file mode 100755 index fb447be4cf..0000000000 --- a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -############################################################################# -# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details -# on how to configure additional NER libraries -############################################################################# - -# ------------------------------------ -# Download OpenNLP Models to classpath -# ------------------------------------ - -OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp" -URL="http://opennlp.sourceforge.net/models-1.5" - -mkdir -p $OPENNLP_LOCATION -if [ "$(ls -A $OPENNLP_LOCATION/*.bin)" ]; then - echo "OpenNLP models directory has files, so skipping fetch"; -else - echo "No OpenNLP models found, so fetching them" - wget "$URL/en-ner-person.bin" -O $OPENNLP_LOCATION/ner-person.bin - wget "$URL/en-ner-location.bin" -O $OPENNLP_LOCATION/ner-location.bin - wget "$URL/en-ner-organization.bin" -O $OPENNLP_LOCATION/ner-organization.bin; - wget "$URL/en-ner-date.bin" -O $OPENNLP_LOCATION/ner-date.bin - wget "$URL/en-ner-time.bin" -O $OPENNLP_LOCATION/ner-time.bin - wget "$URL/en-ner-percentage.bin" -O $OPENNLP_LOCATION/ner-percentage.bin - wget "$URL/en-ner-money.bin" -O $OPENNLP_LOCATION/ner-money.bin -fi - -# -------------------------------------------- -# Create RexExp Example for Email on classpath -# -------------------------------------------- -REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex" -mkdir -p $REGEXP_LOCATION -echo 
"EMAIL=(?:[a-z0-9!#$%&'*+/=?^_\`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])" > $REGEXP_LOCATION/ner-regex.txt - - -# ------------------- -# Now run Tika Server -# ------------------- - -# Can be a single implementation or comma seperated list for multiple for "ner.impl.class" property -RECOGNISERS=org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser -# Set classpath to the Tika Server JAR and the /ner folder so it has the configuration and models from above -CLASSPATH="/ner:/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*" -# Run the server with the custom configuration ner.impl.class property and custom /ner/tika-config.xml -exec java -Dner.impl.class=$RECOGNISERS -cp $CLASSPATH org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/ner/tika-config.xml b/tika-server/docker-build/sample-configs/ner/tika-config.xml deleted file mode 100644 index 65d5774c22..0000000000 --- a/tika-server/docker-build/sample-configs/ner/tika-config.xml +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. 
You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. - --> -<properties> - <parsers> - <parser class="org.apache.tika.parser.ner.NamedEntityParser"> - <mime>application/pdf</mime> - <mime>text/plain</mime> - <mime>text/html</mime> - <mime>application/xhtml+xml</mime> - </parser> - </parsers> -</properties> - diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml deleted file mode 100644 index c70c207b28..0000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. 
- --> -<properties> - <parsers> - <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> - <mime>image/jpeg</mime> - <mime>image/png</mime> - <mime>image/gif</mime> - <params> - <param name="apiBaseUri" type="uri">http://inception-caption:8764/inception/v3</param> - <param name="captions" type="int">5</param> - <param name="maxCaptionLength" type="int">15</param> - <param name="class" type="string">org.apache.tika.parser.captioning.tf.TensorflowRESTCaptioner</param> - </params> - </parser> - </parsers> -</properties> \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml deleted file mode 100644 index f6a4e6a938..0000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. 
- --> -<properties> - <parsers> - <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> - <mime>video/mp4</mime> - <mime>video/quicktime</mime> - <params> - <param name="apiBaseUri" type="uri">http://inception-video:8764/inception/v4</param> - <param name="topN" type="int">4</param> - <param name="minConfidence" type="double">0.015</param> - <param name="mode" type="string">fixed</param> - <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTVideoRecogniser</param> - </params> - </parser> - </parsers> -</properties> \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest.xml b/tika-server/docker-build/sample-configs/vision/inception-rest.xml deleted file mode 100644 index caa6468595..0000000000 --- a/tika-server/docker-build/sample-configs/vision/inception-rest.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - ~ Licensed to the Apache Software Foundation (ASF) under one or more - ~ contributor license agreements. See the NOTICE file distributed with - ~ this work for additional information regarding copyright ownership. - ~ The ASF licenses this file to You under the Apache License, Version 2.0 - ~ (the "License"); you may not use this file except in compliance with - ~ the License. You may obtain a copy of the License at - ~ - ~ http://www.apache.org/licenses/LICENSE-2.0 - ~ - ~ Unless required by applicable law or agreed to in writing, software - ~ distributed under the License is distributed on an "AS IS" BASIS, - ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ~ See the License for the specific language governing permissions and - ~ limitations under the License. 
- --> -<properties> - <parsers> - <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> - <mime>image/jpeg</mime> - <mime>image/png</mime> - <mime>image/gif</mime> - <params> - <param name="apiBaseUri" type="uri">http://inception-rest:8764/inception/v4</param> - <param name="topN" type="int">2</param> - <param name="minConfidence" type="double">0.015</param> - <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-server/docker-build/sample-configs/vision/vlm-claude.json b/tika-server/docker-build/sample-configs/vision/vlm-claude.json new file mode 100644 index 0000000000..e233516b61 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-claude.json @@ -0,0 +1,18 @@ +{ + "_comment": [ + "Vision-Language Model parsing via Anthropic's Claude API.", + "Claude can handle OCR images and PDFs natively (no rasterization needed).", + "Set apiKey to your Anthropic API key — DO NOT commit a real key.", + "Prefer passing it via the ANTHROPIC_API_KEY env var and substituting it", + "at container start, e.g. via an entrypoint shim or sidecar that templates", + "this file. See docs: configuration/parsers/vlm-parsers." 
+ ], + "parsers": [ + { + "claude-vlm-parser": { + "apiKey": "${ANTHROPIC_API_KEY}", + "model": "claude-sonnet-4-20250514" + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/vision/vlm-gemini.json b/tika-server/docker-build/sample-configs/vision/vlm-gemini.json new file mode 100644 index 0000000000..4c33e69f3c --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-gemini.json @@ -0,0 +1,17 @@ +{ + "_comment": [ + "Vision-Language Model parsing via Google's Gemini generateContent API.", + "Gemini can handle OCR images and PDFs natively (no rasterization needed).", + "Set apiKey to your Google AI Studio API key — DO NOT commit a real key.", + "Prefer GEMINI_API_KEY env var + a templating entrypoint, similar to the", + "Claude config. See docs: configuration/parsers/vlm-parsers." + ], + "parsers": [ + { + "gemini-vlm-parser": { + "apiKey": "${GEMINI_API_KEY}", + "model": "gemini-2.5-flash" + } + } + ] +} diff --git a/tika-server/docker-build/sample-configs/vision/vlm-openai.json b/tika-server/docker-build/sample-configs/vision/vlm-openai.json new file mode 100644 index 0000000000..2a4b675ddb --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/vlm-openai.json @@ -0,0 +1,19 @@ +{ + "_comment": [ + "Vision-Language Model parsing via an OpenAI-compatible endpoint.", + "Works with self-hosted backends (vLLM, Ollama, a local FastAPI wrapper)", + "or against OpenAI's own chat-completions API. Set baseUrl to wherever", + "the OpenAI-compatible endpoint is reachable from the tika container.", + "If the endpoint requires authentication, also set apiKey.", + "See docs: configuration/parsers/vlm-parsers." + ], + "parsers": [ + { + "openai-vlm-parser": { + "baseUrl": "http://vlm-server:8000", + "model": "jinaai/jina-vlm", + "timeoutSeconds": 300 + } + } + ] +}
