This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4b78311adb TIKA-4683 -- charset detector dep mgmt and order in AutoDetectReader (#2800)
4b78311adb is described below
commit 4b78311adb3f3836d6852ff3f27863cd7f43d20e
Author: Tim Allison <[email protected]>
AuthorDate: Sat May 2 13:38:13 2026 -0400
TIKA-4683 -- charset detector dep mgmt and order in AutoDetectReader (#2800)
---
.../java/org/apache/tika/detect/AutoDetectReader.java | 10 ++++++----
.../tika-parsers-standard-package/pom.xml | 15 +++++++++++++++
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index a86eb5c2bd..f17844bf78 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -47,8 +47,11 @@ public class AutoDetectReader extends BufferedReader {
private static final EncodingDetector DEFAULT_DETECTOR;
static {
- DEFAULT_DETECTOR = new CompositeEncodingDetector(
- DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
+ // Use DefaultEncodingDetector so SPI-discovered detectors are run in the
+ // pinned order (HtmlEncodingDetector, UniversalEncodingDetector, Icu4jEncodingDetector,
+ // then anything else by class name). Otherwise the order would be whatever
+ // ServiceLoader yields from classpath/jar order, which is fragile.
+ DEFAULT_DETECTOR = new DefaultEncodingDetector(DEFAULT_LOADER);
}
private final Charset charset;
@@ -79,8 +82,7 @@ public class AutoDetectReader extends BufferedReader {
public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
throws IOException, TikaException {
- this(getTikaInputStream(stream), metadata,
- new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)));
+ this(getTikaInputStream(stream), metadata, new DefaultEncodingDetector(loader));
}
public AutoDetectReader(InputStream stream, Metadata metadata)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 14d53e320c..670d29442c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -68,6 +68,21 @@
<artifactId>tika-parser-digest-commons</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-html</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-icu4j</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-universal</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-font-module</artifactId>