This is an automated email from the ASF dual-hosted git repository.
tballison pushed a change to branch charset-move-to-nb
in repository https://gitbox.apache.org/repos/asf/tika.git
from 0f06c14467 rat
add 0ae889f3cb TIKA-4703: Pin docker/* actions to SHA digests per ASF
policy (INFRA-27837) (#2779)
add c0d5296db3 TIKA-4703: Upgrade GitHub Actions to Node.js 24 compatible
versions (#2780)
add dc99fffa04 TIKA-4703: Fix Docker Hub secret name DOCKERHUB_USERNAME ->
DOCKERHUB_USER (#2781)
add 4a43c2ccc0 TIKA-4703: Fix chmod failure in tika-grpc Dockerfile on CI
(#2782)
add e0d4a6d7f3 remove bestMatch (#2785)
add 5dfa028a89 improve legacy charset detector to benefit from features of
StandardHtmlEncodingDetector (#2786)
add 3e45863860 Merge branch 'main' into charset-move-to-nb
No new revisions were added by this update.
Summary of changes:
.github/workflows/docker-release.yml | 28 +--
.github/workflows/docker-snapshot.yml | 18 +-
.github/workflows/main-jdk17-build.yml | 8 +-
.../main-jdk17-windows-build-multi-locale.yml | 4 +-
.github/workflows/main-jdk17-windows-build.yml | 4 +-
.github/workflows/main-jdk21-build.yml | 4 +-
.github/workflows/main-jdk25-build.yml | 4 +-
.../pages/advanced/charset-detection-design.adoc | 192 ++++++++++++++++++++-
.../pages/configuration/encoding-detectors.adoc | 26 +--
.../charsoup/CharSoupEncodingDetector.java | 48 ++++--
.../tika/parser/html/HtmlEncodingDetector.java | 8 +-
.../tika/parser/html/TikaHtmlCharsetAliases.java | 172 ++++++++++++++++++
.../parser/html/charsetdetector/PreScanner.java | 17 --
.../StandardHtmlEncodingDetector.java | 66 ++-----
tika-grpc/docker-build/Dockerfile | 3 +-
.../charsoup/GenerativeLanguageModel.java | 48 ++----
.../tika/config/TikaEncodingDetectorTest.java | 43 +----
...IKA-2273-exclude-encoding-detector-default.json | 2 +-
.../configs/tika-config-html-standalone-bom.json | 9 -
.../tika/parser/html/HtmlEncodingDetectorTest.java | 58 +++++++
.../html/StandardHtmlEncodingDetectorTest.java | 30 +---
21 files changed, 545 insertions(+), 247 deletions(-)
create mode 100644 tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/TikaHtmlCharsetAliases.java
delete mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-html-standalone-bom.json