(tika) 03/04: TIKA-4720: add JunkFilterEncodingDetector

tallison Sat, 25 Apr 2026 15:03:38 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4720-wiring
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 40a75367965a8c0589afed4e2f816af0e91b0966
Author: tallison <[email protected]>
AuthorDate: Fri Apr 24 17:14:55 2026 -0400

    TIKA-4720: add JunkFilterEncodingDetector
---
 tika-ml/pom.xml                                    |  13 +
 tika-ml/tika-ml-junkdetect/pom.xml                 |   6 +
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  18 ++
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 301 +++++++++++++++++++++
 .../junkdetect/JunkFilterEncodingDetectorTest.java | 164 +++++++++++
 .../pom.xml                                        |   6 +
 .../tika/config/TikaEncodingDetectorTest.java      |  42 ++-
 .../TIKA-2485-encoding-detector-mark-limits.json   |   3 +
 8 files changed, 531 insertions(+), 22 deletions(-)

diff --git a/tika-ml/pom.xml b/tika-ml/pom.xml
index 5c9cf03af1..11ddcb221c 100644
--- a/tika-ml/pom.xml
+++ b/tika-ml/pom.xml
@@ -39,6 +39,19 @@
 
   <build>
     <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <annotationProcessorPaths>
+            <path>
+              <groupId>org.apache.tika</groupId>
+              <artifactId>tika-annotation-processor</artifactId>
+              <version>${project.version}</version>
+            </path>
+          </annotationProcessorPaths>
+        </configuration>
+      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-checkstyle-plugin</artifactId>
diff --git a/tika-ml/tika-ml-junkdetect/pom.xml 
b/tika-ml/tika-ml-junkdetect/pom.xml
index 672e49195a..39eb506efd 100644
--- a/tika-ml/tika-ml-junkdetect/pom.xml
+++ b/tika-ml/tika-ml-junkdetect/pom.xml
@@ -43,6 +43,12 @@
       <artifactId>tika-core</artifactId>
       <version>${revision}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-annotation-processor</artifactId>
+      <version>${revision}</version>
+      <scope>provided</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-ml-core</artifactId>
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 07aeb64164..1719043f40 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -167,6 +167,24 @@ public final class JunkDetector implements 
TextQualityDetector {
         }
     }
 
+    /**
+     * {@link java.util.ServiceLoader} provider method (Java 9+).  Lets a
+     * named module declare {@code JunkDetector} as a
+     * {@link org.apache.tika.quality.TextQualityDetector} provider although
+     * it is built via {@link #loadFromClasspath()}, not a public no-arg
+     * constructor.  Classpath (META-INF/services) lookup ignores this method.
+     *
+     * @throws java.io.UncheckedIOException if the bundled model cannot be loaded
+     */
+    public static JunkDetector provider() {
+        try {
+            return loadFromClasspath();
+        } catch (IOException e) {
+            throw new java.io.UncheckedIOException(
+                    "Failed to load bundled JunkDetector model", e);
+        }
+    }
+
     /**
      * Loads a model from the given file path.  The file may be gzipped or raw.
      */
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
new file mode 100644
index 0000000000..dcbe1bbe4b
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingResult;
+import org.apache.tika.detect.MetaEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.quality.TextQualityComparison;
+import org.apache.tika.quality.TextQualityDetector;
+
+/**
+ * A {@link MetaEncodingDetector} that arbitrates charset candidates by
+ * asking a {@link TextQualityDetector} which decoded candidate looks
+ * most like natural text.
+ *
+ * <p>Each base {@link org.apache.tika.detect.EncodingDetector} in the
+ * {@link org.apache.tika.detect.CompositeEncodingDetector} chain emits
+ * candidates into the {@link EncodingDetectorContext}.  This meta detector
+ * then reads the raw probe bytes, decodes them under each unique candidate
+ * charset, and runs pairwise comparisons via
+ * {@link TextQualityDetector#compare} to pick the candidate whose decoding
+ * produces the cleanest text.  BOM-declared, meta-tag-declared,
+ * structural, and statistical candidates compete on decode quality alone;
+ * a DECLARATIVE candidate is preferred only if all decodings are identical.
+ *
+ * <p>The default constructor loads the bundled {@link JunkDetector}
+ * directly from the classpath.  If the model cannot be loaded,
+ * this detector becomes a no-op: {@link #detect} returns an empty list
+ * and {@link org.apache.tika.detect.CompositeEncodingDetector} falls
+ * back to its default confidence-based ordering.
+ *
+ * @since Apache Tika 4.0.0 (TIKA-4720)
+ */
+@TikaComponent(name = "junk-filter-encoding-detector")
+public class JunkFilterEncodingDetector implements MetaEncodingDetector {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(JunkFilterEncodingDetector.class);
+
+    /** How many probe bytes to read for decoding candidates.  Matches the
+     * default read limit used by the charset base detectors. */
+    private static final int DEFAULT_READ_LIMIT = 16384;
+
+    /** Quality detector; {@code null} makes {@link #detect} a no-op. */
+    private final TextQualityDetector qualityDetector;
+
+    private int readLimit = DEFAULT_READ_LIMIT;
+
+    public JunkFilterEncodingDetector() {
+        // The junk detector is hardcoded rather than ServiceLoader-discovered
+        // so construction cannot silently fail to register a quality detector
+        // and leave this meta detector as a no-op.  JunkDetector lives in the
+        // same module and loads its bundled model from the classpath.
+        TextQualityDetector q = null;
+        try {
+            q = JunkDetector.loadFromClasspath();
+            LOG.debug("Loaded JunkDetector: {}", q.getClass().getName());
+        } catch (Throwable t) {
+            // A broken model binary (e.g. block-table dimension mismatch
+            // across JVM Unicode versions) would otherwise propagate and
+            // prevent this meta detector from registering at all.  Fail safe:
+            // log and operate as a no-op.
+            LOG.warn("Failed to load JunkDetector; JunkFilterEncodingDetector "
+                    + "will operate as a no-op: {}", t.toString());
+        }
+        this.qualityDetector = q;
+    }
+
+    /** Test-only / deterministic-wiring constructor. */
+    public JunkFilterEncodingDetector(TextQualityDetector qualityDetector) {
+        this.qualityDetector = qualityDetector;
+    }
+
+    public int getReadLimit() {
+        return readLimit;
+    }
+
+    public void setReadLimit(int readLimit) {
+        this.readLimit = readLimit;
+    }
+
+    @Override
+    public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
+                                       ParseContext parseContext) throws 
IOException {
+        if (qualityDetector == null) {
+            return Collections.emptyList();
+        }
+        if (parseContext == null) {
+            return Collections.emptyList();
+        }
+        EncodingDetectorContext context = 
parseContext.get(EncodingDetectorContext.class);
+        if (context == null || context.getResults().isEmpty()) {
+            return Collections.emptyList();
+        }
+
+        Set<Charset> uniqueCharsets = context.getUniqueCharsets();
+        if (uniqueCharsets.size() <= 1) {
+            // Nothing to arbitrate.  Let the composite's default ordering
+            // pick the single candidate.
+            return Collections.emptyList();
+        }
+
+        if (tis == null) {
+            context.setArbitrationInfo("junk-filter-no-stream");
+            return Collections.emptyList();
+        }
+
+        byte[] bytes = readProbe(tis);
+        if (bytes == null || bytes.length == 0) {
+            context.setArbitrationInfo("junk-filter-empty-stream");
+            return Collections.emptyList();
+        }
+        bytes = stripBomBytes(bytes);
+
+        // Decode probe under each candidate, preserving insertion order so
+        // tournament seeding is deterministic.
+        Map<Charset, String> candidates = new LinkedHashMap<>();
+        for (Charset cs : uniqueCharsets) {
+            String decoded = safeDecode(bytes, cs);
+            if (decoded != null && !decoded.isEmpty()) {
+                candidates.put(cs, decoded);
+            }
+        }
+        if (candidates.size() <= 1) {
+            // One or zero candidates produced usable text; nothing to compare.
+            return Collections.emptyList();
+        }
+        if (allDecodingsIdentical(candidates)) {
+            // Byte-identical decodings (typical on pure-ASCII probes).
+            // Text quality cannot distinguish them.  Prefer an author
+            // declaration (BOM / HTML meta charset / HTTP Content-Type)
+            // over statistical or structural candidates: if the document
+            // tells us what it is and the bytes are compatible with that
+            // claim, honour it.
+            Charset declared = pickDeclarative(context, candidates.keySet());
+            if (declared != null) {
+                float conf = context.getTopConfidenceFor(declared);
+                context.setArbitrationInfo("junk-filter-prefer-declarative");
+                return List.of(new EncodingResult(declared, conf));
+            }
+            context.setArbitrationInfo("junk-filter-identical-decodings");
+            return Collections.emptyList();
+        }
+
+        // Pairwise tournament: the first candidate seeds the champion slot;
+        // every subsequent candidate challenges the current champion.
+        Iterator<Map.Entry<Charset, String>> it = 
candidates.entrySet().iterator();
+        Map.Entry<Charset, String> champion = it.next();
+        while (it.hasNext()) {
+            Map.Entry<Charset, String> challenger = it.next();
+            TextQualityComparison cmp = qualityDetector.compare(
+                    champion.getKey().name(), champion.getValue(),
+                    challenger.getKey().name(), challenger.getValue());
+            if ("B".equals(cmp.winner())) {
+                champion = challenger;
+            }
+        }
+
+        float confidence = context.getTopConfidenceFor(champion.getKey());
+        context.setArbitrationInfo("junk-filter-selected");
+        return List.of(new EncodingResult(champion.getKey(), confidence));
+    }
+
+    /**
+     * Return the first DECLARATIVE charset in {@code context} whose charset
+     * is also in {@code eligible}, or {@code null} if no declarative result
+     * matches an eligible candidate.  "Eligible" = present in the candidates
+     * we actually decoded (i.e. excludes candidates that failed to decode).
+     */
+    private static Charset pickDeclarative(EncodingDetectorContext context,
+                                           Set<Charset> eligible) {
+        for (EncodingDetectorContext.Result r : context.getResults()) {
+            for (EncodingResult er : r.getEncodingResults()) {
+                if (er.getResultType() == EncodingResult.ResultType.DECLARATIVE
+                        && eligible.contains(er.getCharset())) {
+                    return er.getCharset();
+                }
+            }
+        }
+        return null;
+    }
+
+    private static boolean allDecodingsIdentical(Map<Charset, String> 
candidates) {
+        String first = null;
+        for (String s : candidates.values()) {
+            if (first == null) {
+                first = s;
+            } else if (!first.equals(s)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private byte[] readProbe(TikaInputStream tis) throws IOException {
+        try {
+            tis.mark(readLimit);
+            byte[] buf = new byte[readLimit];
+            int total = 0;
+            int read;
+            while (total < readLimit
+                    && (read = tis.read(buf, total, readLimit - total)) != -1) 
{
+                total += read;
+            }
+            if (total == 0) {
+                return null;
+            }
+            if (total < readLimit) {
+                byte[] trimmed = new byte[total];
+                System.arraycopy(buf, 0, trimmed, 0, total);
+                return trimmed;
+            }
+            return buf;
+        } finally {
+            tis.reset();
+        }
+    }
+
+    private static String safeDecode(byte[] bytes, Charset charset) {
+        try {
+            return new String(bytes, charset);
+        } catch (Exception e) {
+            LOG.debug("Decode failed for {}: {}", charset.name(), 
e.toString());
+            return null;
+        }
+    }
+
+    /**
+     * Strip a leading byte-order mark, if any.  UTF-32 signatures are
+     * checked before UTF-16 because the UTF-32 LE BOM ({@code FF FE 00 00})
+     * starts with the UTF-16 LE BOM ({@code FF FE}).
+     */
+    private static byte[] stripBomBytes(byte[] bytes) {
+        int bomLen = bomLength(bytes);
+        if (bomLen == 0) {
+            return bytes;
+        }
+        return Arrays.copyOfRange(bytes, bomLen, bytes.length);
+    }
+
+    private static int bomLength(byte[] b) {
+        if (b.length >= 4
+                && (b[0] & 0xFF) == 0x00 && (b[1] & 0xFF) == 0x00
+                && (b[2] & 0xFF) == 0xFE && (b[3] & 0xFF) == 0xFF) {
+            return 4; // UTF-32BE
+        }
+        if (b.length >= 4
+                && (b[0] & 0xFF) == 0xFF && (b[1] & 0xFF) == 0xFE
+                && (b[2] & 0xFF) == 0x00 && (b[3] & 0xFF) == 0x00) {
+            return 4; // UTF-32LE
+        }
+        if (b.length >= 3
+                && (b[0] & 0xFF) == 0xEF && (b[1] & 0xFF) == 0xBB
+                && (b[2] & 0xFF) == 0xBF) {
+            return 3; // UTF-8
+        }
+        if (b.length >= 2
+                && (b[0] & 0xFF) == 0xFE && (b[1] & 0xFF) == 0xFF) {
+            return 2; // UTF-16BE
+        }
+        if (b.length >= 2
+                && (b[0] & 0xFF) == 0xFF && (b[1] & 0xFF) == 0xFE) {
+            return 2; // UTF-16LE
+        }
+        return 0;
+    }
+
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java
new file mode 100644
index 0000000000..0b97a9a0bd
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.EncodingDetectorContext;
+import org.apache.tika.detect.EncodingResult;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.quality.TextQualityComparison;
+import org.apache.tika.quality.TextQualityDetector;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * Unit tests for {@link JunkFilterEncodingDetector}.
+ *
+ * <p>Uses a stub {@link TextQualityDetector} rather than the real
+ * {@link JunkDetector} — we are testing arbitration control flow, not
+ * the quality of the junk detector's decisions.
+ */
+public class JunkFilterEncodingDetectorTest {
+
+    /** Stub quality detector: always picks the label matching {@link 
#preferred}. */
+    private static final class PreferenceStub implements TextQualityDetector {
+        private final String preferred;
+
+        PreferenceStub(String preferred) {
+            this.preferred = preferred;
+        }
+
+        @Override
+        public TextQualityScore score(String text) {
+            return new TextQualityScore(Float.NaN, Float.NaN, Float.NaN,
+                    Float.NaN, "UNKNOWN");
+        }
+
+        @Override
+        public TextQualityComparison compare(String labelA, String candidateA,
+                                             String labelB, String candidateB) 
{
+            String winner = preferred.equals(labelA) ? "A"
+                    : preferred.equals(labelB) ? "B" : "A";
+            return new TextQualityComparison(winner, 0.0f,
+                    score(candidateA), score(candidateB), labelA, labelB);
+        }
+    }
+
+    private static ParseContext contextWith(EncodingResult... results) {
+        EncodingDetectorContext ctx = new EncodingDetectorContext();
+        ctx.addResult(List.of(results), "stub");
+        ParseContext p = new ParseContext();
+        p.set(EncodingDetectorContext.class, ctx);
+        return p;
+    }
+
+    @Test
+    public void picksPreferredCharsetFromTwoCandidates() throws Exception {
+        Charset utf8 = StandardCharsets.UTF_8;
+        Charset win1252 = Charset.forName("windows-1252");
+        // Non-ASCII bytes so UTF-8 and windows-1252 decode to different 
strings
+        // (otherwise arbiter sees identical decodings and abstains).
+        byte[] bytes = "café résumé naïve".getBytes(StandardCharsets.UTF_8);
+
+        ParseContext pc = contextWith(
+                new EncodingResult(utf8, 0.5f, "UTF-8",
+                        EncodingResult.ResultType.STATISTICAL),
+                new EncodingResult(win1252, 0.5f, "windows-1252",
+                        EncodingResult.ResultType.STATISTICAL));
+
+        JunkFilterEncodingDetector detector =
+                new JunkFilterEncodingDetector(new PreferenceStub("UTF-8"));
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            List<EncodingResult> out = detector.detect(tis, new Metadata(), 
pc);
+            assertEquals(1, out.size(), "Expected exactly one result");
+            assertEquals(utf8, out.get(0).getCharset());
+        }
+    }
+
+    @Test
+    public void noopWhenNoQualityDetector() throws Exception {
+        byte[] bytes = "hello".getBytes(StandardCharsets.UTF_8);
+        ParseContext pc = contextWith(
+                new EncodingResult(StandardCharsets.UTF_8, 0.5f, "UTF-8",
+                        EncodingResult.ResultType.STATISTICAL),
+                new EncodingResult(Charset.forName("windows-1252"), 0.5f,
+                        "windows-1252", 
EncodingResult.ResultType.STATISTICAL));
+
+        JunkFilterEncodingDetector detector =
+                new JunkFilterEncodingDetector((TextQualityDetector) null);
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            List<EncodingResult> out = detector.detect(tis, new Metadata(), 
pc);
+            assertTrue(out.isEmpty(),
+                    "No TextQualityDetector → detector must be a no-op");
+        }
+    }
+
+    @Test
+    public void noopWhenOnlyOneCandidate() throws Exception {
+        byte[] bytes = "hello".getBytes(StandardCharsets.UTF_8);
+        ParseContext pc = contextWith(
+                new EncodingResult(StandardCharsets.UTF_8, 0.9f, "UTF-8",
+                        EncodingResult.ResultType.DECLARATIVE));
+
+        JunkFilterEncodingDetector detector =
+                new JunkFilterEncodingDetector(new PreferenceStub("UTF-8"));
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            List<EncodingResult> out = detector.detect(tis, new Metadata(), 
pc);
+            assertTrue(out.isEmpty(),
+                    "Single candidate → no arbitration needed, no-op");
+        }
+    }
+
+    @Test
+    public void noopWhenAllDecodingsIdentical() throws Exception {
+        // Pure-ASCII bytes decode identically under UTF-8 and windows-1252.
+        byte[] bytes = "plain ascii 
content".getBytes(StandardCharsets.US_ASCII);
+
+        ParseContext pc = contextWith(
+                new EncodingResult(StandardCharsets.UTF_8, 0.5f, "UTF-8",
+                        EncodingResult.ResultType.STATISTICAL),
+                new EncodingResult(Charset.forName("windows-1252"), 0.5f,
+                        "windows-1252", 
EncodingResult.ResultType.STATISTICAL));
+
+        JunkFilterEncodingDetector detector =
+                new JunkFilterEncodingDetector(new PreferenceStub("UTF-8"));
+        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+            List<EncodingResult> out = detector.detect(tis, new Metadata(), 
pc);
+            assertTrue(out.isEmpty(),
+                    "Byte-identical decodings → arbiter abstains");
+            assertEquals("junk-filter-identical-decodings",
+                    
pc.get(EncodingDetectorContext.class).getArbitrationInfo());
+        }
+    }
+
+    // NOTE: a default-constructor integration test is not included here.
+    // The no-arg constructor does not use ServiceLoader: it calls
+    // JunkDetector.loadFromClasspath() directly and degrades to a no-op
+    // if the bundled model fails to load.  These unit tests deliberately
+    // use a stub TextQualityDetector to exercise arbitration control flow;
+    // coverage of the real model belongs in a separate test that asserts
+    // the bundled JunkDetector actually loaded.
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml
index f2dfdbb122..1bee0907b0 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml
@@ -95,6 +95,12 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-ml-junkdetect</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-serialization</artifactId>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index a7910c9f77..931b0df0c6 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -28,7 +28,6 @@ import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
 
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaLoaderHelper;
@@ -38,6 +37,7 @@ import org.apache.tika.detect.BOMDetector;
 import org.apache.tika.detect.CompositeEncodingDetector;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.detect.EncodingResult;
+import org.apache.tika.detect.MetaEncodingDetector;
 import org.apache.tika.detect.MetadataCharsetDetector;
 import org.apache.tika.detect.OverrideEncodingDetector;
 import org.apache.tika.exception.TikaConfigException;
@@ -61,10 +61,12 @@ public class TikaEncodingDetectorTest extends TikaTest {
         EncodingDetector detector = 
TikaLoader.loadDefault().loadEncodingDetectors();
         assertTrue(detector instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors = ((CompositeEncodingDetector) 
detector).getDetectors();
-        // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector); no 
MetaEncodingDetector in default chain
-        assertEquals(4, detectors.size());
+        // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector) + 
JunkFilter (MetaEncodingDetector)
+        assertEquals(5, detectors.size());
+        // meta detector is always last (partitioned by 
CompositeEncodingDetector)
+        assertTrue(detectors.get(4) instanceof MetaEncodingDetector);
         // base detectors — sorted by full class name; check by type
-        Set<Class<?>> baseClasses = detectors.stream()
+        Set<Class<?>> baseClasses = detectors.subList(0, 4).stream()
                 .map(Object::getClass).collect(Collectors.toSet());
         assertTrue(baseClasses.contains(BOMDetector.class));
         assertTrue(baseClasses.contains(MetadataCharsetDetector.class));
@@ -85,13 +87,14 @@ public class TikaEncodingDetectorTest extends TikaTest {
         assertTrue(detector1 instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors1Children =
                 ((CompositeEncodingDetector) detector1).getDetectors();
-        // BOM + Metadata + ML base detectors (html excluded, no meta)
-        assertEquals(3, detectors1Children.size());
-        Set<Class<?>> innerClasses = detectors1Children.stream()
+        // BOM + Metadata + ML base detectors + JunkFilter meta (html excluded)
+        assertEquals(4, detectors1Children.size());
+        Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream()
                 .map(Object::getClass).collect(Collectors.toSet());
         assertTrue(innerClasses.contains(BOMDetector.class));
         assertTrue(innerClasses.contains(MetadataCharsetDetector.class));
         assertTrue(innerClasses.contains(MojibusterEncodingDetector.class));
+        assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector);
 
         assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
 
@@ -183,9 +186,9 @@ public class TikaEncodingDetectorTest extends TikaTest {
                     ((AbstractEncodingDetectorParser) encodingDetectingParser)
                             .getEncodingDetector();
             assertTrue(encodingDetector instanceof CompositeEncodingDetector);
-            // BOM, Metadata, ML, Html base detectors
-            // (ICU4J is excluded but was already not in the default chain; no 
meta)
-            assertEquals(4, ((CompositeEncodingDetector) 
encodingDetector).getDetectors().size());
+            // BOM, Metadata, ML, Html base detectors + JunkFilter meta
+            // (ICU4J is excluded but was already not in the default chain)
+            assertEquals(5, ((CompositeEncodingDetector) 
encodingDetector).getDetectors().size());
             for (EncodingDetector child : ((CompositeEncodingDetector) 
encodingDetector)
                     .getDetectors()) {
                 assertNotContained("cu4j", 
child.getClass().getCanonicalName());
@@ -212,8 +215,8 @@ public class TikaEncodingDetectorTest extends TikaTest {
             assertTrue(encodingDetector instanceof CompositeEncodingDetector);
             List<EncodingDetector> children =
                     ((CompositeEncodingDetector) 
encodingDetector).getDetectors();
-            // 3 base detectors, no meta
-            assertEquals(3, children.size(), 
childParser.getClass().toString());
+            // 3 base detectors + 1 MetaEncodingDetector (JunkFilter)
+            assertEquals(4, children.size(), 
childParser.getClass().toString());
             assertTrue(children.get(0) instanceof MojibusterEncodingDetector,
                     childParser.getClass().toString());
             HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) 
children.get(1);
@@ -221,12 +224,11 @@ public class TikaEncodingDetectorTest extends TikaTest {
                     childParser.getClass().toString());
             assertTrue(children.get(2) instanceof StandardHtmlEncodingDetector,
                     childParser.getClass().toString());
+            assertTrue(children.get(3) instanceof MetaEncodingDetector,
+                    childParser.getClass().toString());
         }
     }
 
-    @Disabled("TIKA-4720: needs JunkFilterEncodingDetector meta-arbiter so the 
"
-            + "mark-limit-raised HTML detector's DECLARATIVE UTF-8 beats NB's "
-            + "STATISTICAL windows-1252")
     @Test
     public void testMarkLimitIntegration() throws Exception {
         StringBuilder sb = new StringBuilder();
@@ -279,14 +281,10 @@ public class TikaEncodingDetectorTest extends TikaTest {
     /**
      * ASCII HTML with an explicit {@code <meta charset="UTF-8">} must be
      * detected as UTF-8.  The HTML detector produces a DECLARATIVE UTF-8
-     * result which outranks the statistical windows-1252 fallback.
-     *
-     * <p>Disabled pending reinstatement of a {@code MetaEncodingDetector}
-     * (TIKA-4720): without one in the chain, Mojibuster's STATISTICAL
-     * windows-1252 beats the HTML detector's DECLARATIVE UTF-8. Re-enable
-     * once {@code JunkFilterEncodingDetector} lands.
+     * result; {@code JunkFilterEncodingDetector} arbitrates the tie in its
+     * favour (pure-ASCII bytes decode identically as UTF-8 and windows-1252,
+     * so the DECLARATIVE hint wins).
      */
-    @Disabled("TIKA-4720: needs JunkFilterEncodingDetector meta-arbiter")
     @Test
     public void testAsciiHtmlWithMetaIsDetectedAsUtf8() throws Exception {
         byte[] bytes =
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
index 7331a02325..d4f2483cad 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
@@ -12,6 +12,9 @@
       "standard-html-encoding-detector": {
         "markLimit": 100000
       }
+    },
+    {
+      "junk-filter-encoding-detector": {}
     }
   ]
 }

(tika) 03/04: TIKA-4720: add JunkFilterEncodingDetector

Reply via email to