This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4b78311adb TIKA-4683 -- charset detector dep mgmt and order in AutoDetectReader (#2800)
4b78311adb is described below

commit 4b78311adb3f3836d6852ff3f27863cd7f43d20e
Author: Tim Allison <[email protected]>
AuthorDate: Sat May 2 13:38:13 2026 -0400

    TIKA-4683 -- charset detector dep mgmt and order in AutoDetectReader (#2800)
---
 .../java/org/apache/tika/detect/AutoDetectReader.java     | 10 ++++++----
 .../tika-parsers-standard-package/pom.xml                 | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index a86eb5c2bd..f17844bf78 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -47,8 +47,11 @@ public class AutoDetectReader extends BufferedReader {
     private static final EncodingDetector DEFAULT_DETECTOR;
 
     static {
-        DEFAULT_DETECTOR = new CompositeEncodingDetector(
-                DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
+        // Use DefaultEncodingDetector so SPI-discovered detectors are run in the
+        // pinned order (HtmlEncodingDetector, UniversalEncodingDetector, Icu4jEncodingDetector,
+        // then anything else by class name). Otherwise the order would be whatever
+        // ServiceLoader yields from classpath/jar order, which is fragile.
+        DEFAULT_DETECTOR = new DefaultEncodingDetector(DEFAULT_LOADER);
     }
 
     private final Charset charset;
@@ -79,8 +82,7 @@ public class AutoDetectReader extends BufferedReader {
 
     public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
             throws IOException, TikaException {
-        this(getTikaInputStream(stream), metadata,
-                new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)));
+        this(getTikaInputStream(stream), metadata, new DefaultEncodingDetector(loader));
     }
 
     public AutoDetectReader(InputStream stream, Metadata metadata)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 14d53e320c..670d29442c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -68,6 +68,21 @@
       <artifactId>tika-parser-digest-commons</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-encoding-detector-html</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-encoding-detector-icu4j</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-encoding-detector-universal</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-font-module</artifactId>

Reply via email to