(tika) branch main updated: charset and junk tweaks (#2794)

tallison Wed, 29 Apr 2026 13:28:17 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 66a83d3441 charset and junk tweaks (#2794)
66a83d3441 is described below

commit 66a83d3441767ab8f613787d629deabc674124bd
Author: Tim Allison <[email protected]>
AuthorDate: Wed Apr 29 16:27:01 2026 -0400

    charset and junk tweaks (#2794)
---
 .../tika/detect/CompositeEncodingDetector.java     |  31 +++-
 .../tika/ml/chardetect/HtmlByteStripper.java       |  16 ++
 .../ml/chardetect/MojibusterEncodingDetector.java  | 102 ++++++++++-
 .../ml/chardetect/StructuralEncodingRules.java     | 108 ++++++++++++
 .../ml/junkdetect/JunkFilterEncodingDetector.java  | 186 ++++++++++++++++++++-
 .../apache/tika/ml/junkdetect/EntityRefProbe.java  | 164 ++++++++++++++++++
 6 files changed, 598 insertions(+), 9 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java 
b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index fc8b0ab038..ab554d5d7b 100644
--- 
a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ 
b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -109,18 +109,22 @@ public class CompositeEncodingDetector implements 
EncodingDetector, Serializable
                                                   ParseContext parseContext)
             throws IOException {
         for (EncodingDetector detector : getDetectors()) {
+            String name = detector.getClass().getSimpleName();
+            LOG.trace("chardet enter {}", name);
             List<EncodingResult> results = detector.detect(tis, metadata, 
parseContext);
             if (!results.isEmpty()) {
                 Charset detected = results.get(0).getCharset();
+                LOG.trace("chardet hit  {} -> {} [{}] conf={}", name, detected.name(),
+                        results.get(0).getResultType(), results.get(0).getConfidence());
                 metadata.set(TikaCoreProperties.DETECTED_ENCODING, 
detected.name());
-                if (!detector.getClass().getSimpleName()
-                        .equals("CompositeEncodingDetector")) {
-                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
-                            detector.getClass().getSimpleName());
+                if (!name.equals("CompositeEncodingDetector")) {
+                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR, name);
                 }
                 return results;
             }
+            LOG.trace("chardet miss {} (empty)", name);
         }
+        LOG.trace("chardet no detector returned a result");
         return Collections.emptyList();
     }
 
@@ -142,16 +146,35 @@ public class CompositeEncodingDetector implements 
EncodingDetector, Serializable
 
         try {
             for (EncodingDetector detector : baseDetectors) {
+                String name = detector.getClass().getSimpleName();
+                LOG.trace("chardet enter {}", name);
                 List<EncodingResult> detected = detector.detect(tis, metadata, 
parseContext);
                 if (!detected.isEmpty()) {
+                    LOG.trace("chardet emit {} -> {} [{}] conf={}", name,
+                            detected.get(0).getCharset().name(),
+                            detected.get(0).getResultType(),
+                            detected.get(0).getConfidence());
                     context.addResult(detected, 
detector.getClass().getSimpleName());
+                } else {
+                    LOG.trace("chardet miss {} (empty)", name);
                 }
             }
 
             // Each base detector handles its own mark/reset, so the stream is
             // back at the start here. CharSoup handles its own mark/reset too.
+            String metaName = metaDetector.getClass().getSimpleName();
+            LOG.trace("chardet meta enter {} (candidates={})", metaName,
+                    context.getUniqueCharsets());
             List<EncodingResult> metaResults =
                     metaDetector.detect(tis, metadata, parseContext);
+            if (!metaResults.isEmpty()) {
+                LOG.trace("chardet meta {} -> {} (arbitration={})", metaName,
+                        metaResults.get(0).getCharset().name(),
+                        context.getArbitrationInfo());
+            } else {
+                LOG.trace("chardet meta {} abstained (arbitration={})", 
metaName,
+                        context.getArbitrationInfo());
+            }
 
             List<EncodingResult> finalResults;
             String detectorName;
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
index babd931e7e..1443ae9723 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
@@ -61,6 +61,11 @@ public final class HtmlByteStripper {
     private static final int RAW_BODY = 7;
     private static final int ATTR_NAME = 8;
     private static final int ATTR_AFTER_EQUALS = 9;
+    /** Inside a markup declaration like {@code <!DOCTYPE html ...>} or
+     *  a processing instruction like {@code <?xml version="1.0"?>}. Both
+     *  end at the next {@code >}. Internal subsets ({@code <!DOCTYPE foo [ ... ]>})
+     *  are rare; we'd stop at the first nested {@code >}. Acceptable. */
+    private static final int DECL_OR_PI = 10;
 
     private static final byte[] SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
     private static final byte[] STYLE = {'s', 't', 'y', 'l', 'e'};
@@ -158,6 +163,11 @@ public final class HtmlByteStripper {
                         state = COMMENT;
                         tagCount++;
                         i += 2;
+                    } else if (b == '!' || b == '?') {
+                        // <!DOCTYPE ...>, <!ATTLIST ...>, <?xml ...?>, etc.
+                        // Consume bytes up to the next '>'.
+                        state = DECL_OR_PI;
+                        tagCount++;
                     } else if (b == '/' || isAsciiLetter(b)) {
                         state = TAG_NAME;
                         nameStart = i;
@@ -173,6 +183,12 @@ public final class HtmlByteStripper {
                     }
                     break;
 
+                case DECL_OR_PI:
+                    if (b == '>') {
+                        state = TEXT;
+                    }
+                    break;
+
                 case TAG_NAME:
                     if (isTagNameTerminator(b)) {
                         int nameLen = i - nameStart;
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index d1746b3781..ca6b71c0ff 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -25,6 +25,8 @@ import java.util.List;
 import java.util.Locale;
 
 import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.EncodingDetector;
@@ -67,6 +69,9 @@ import org.apache.tika.parser.ParseContext;
 @TikaComponent(name = "mojibuster-encoding-detector")
 public class MojibusterEncodingDetector implements EncodingDetector {
 
+    private static final Logger LOG =
+            LoggerFactory.getLogger(MojibusterEncodingDetector.class);
+
     /** Default NB bigram model on the classpath. */
     public static final String DEFAULT_MODEL_RESOURCE =
             "/org/apache/tika/ml/chardetect/nb-bigram.bin";
@@ -121,6 +126,24 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
     /** Confidence for the windows-1252 fallback emitted on empty/ASCII 
probes. */
     private static final float FALLBACK_CONFIDENCE = 0.1f;
 
+    /**
+     * Maximum ratio of malformed UTF-8 sequences to probe bytes we tolerate
+     * before disqualifying NB's UTF-8 pick.  Real-world UTF-8 files often
+     * contain one or two corrupted bytes (copy-paste accidents, truncation,
+     * transport flips) — rejecting them outright would force the detector
+     * to drop a high-confidence UTF-8 classification on otherwise-valid
+     * text and fall through to {@code AutoDetectReader.detect}, which
+     * raises {@code TikaException} when the chain returns no candidates.
+     * 0.5% (1 malformed sequence per 200 bytes) accommodates "tiny
+     * corruption" while still rejecting genuinely-non-UTF-8 streams (which
+     * would have many more malformed sequences).
+     *
+     * <p>TACTICAL: remove or revisit when Mojibuster's UTF-8 grammar
+     * check is replaced with a probabilistic decoder that returns a
+     * confidence score directly.</p>
+     */
+    private static final double UTF8_MALFORMED_TOLERANCE = 0.005;
+
     /** Windows-1252: the WHATWG-canonical default for unlabeled Western 
content. */
     private static final String WIN1252 = "windows-1252";
 
@@ -173,12 +196,18 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
      * structural checks, which need byte alignment intact.
      */
     public List<EncodingResult> detect(byte[] probe, Metadata metadata) {
+        if (LOG.isTraceEnabled()) {
+            int probeLen = probe == null ? 0 : probe.length;
+            int highBytes = probe == null ? 0 : countHighBytes(probe);
+            LOG.trace("mojibuster enter probe={}B highBytes={}", probeLen, 
highBytes);
+        }
         // Empty / near-empty probes: return the WHATWG default so
         // downstream callers don't see an empty list (which propagates
         // up as "Failed to detect the character encoding of a
         // document" in TXTParser / RFC822Parser / etc).  windows-1252
         // at low confidence lets any declarative hint override.
         if (probe == null || probe.length < 2) {
+            LOG.trace("mojibuster -> windows-1252 fallback (probe<2B)");
             return windows1252Fallback();
         }
 
@@ -190,6 +219,7 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         // consulting NB so we don't hand back a bias-driven x-MacRoman
         // or IBM850 pick.
         if (isPureAscii(probe)) {
+            LOG.trace("mojibuster -> windows-1252 fallback (pure ASCII)");
             return windows1252Fallback();
         }
 
@@ -207,6 +237,8 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         // UTF-32 codepoint validity — structural candidate.  Also
         // collects UTF-16 surrogate invalidity flags used below.
         WideUnicodeDetector.Result wide = WideUnicodeDetector.analyze(probe);
+        LOG.trace("mojibuster wideUnicode charset={} invalidLE={} invalidBE={}",
+                wide.charset, wide.invalidUtf16Le, wide.invalidUtf16Be);
         if (wide.charset != null) {
             pool.add(new EncodingResult(wide.charset, UTF32_STRUCTURAL_CONF,
                     wide.charset.name(), 
EncodingResult.ResultType.STRUCTURAL));
@@ -222,14 +254,19 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         // discriminate UTF-16 reliably (see
         // why-stride1-bigrams-dont-work-for-utf16.md), so we keep
         // UTF-16 out of NB training and delegate to the specialist.
-        if (StructuralEncodingRules.has2ByteColumnAsymmetryEvidence(probe)) {
+        boolean utf16Gate = 
StructuralEncodingRules.has2ByteColumnAsymmetryEvidence(probe);
+        LOG.trace("mojibuster utf16Gate={}", utf16Gate);
+        if (utf16Gate) {
             List<EncodingResult> utf16Results = utf16.detect(probe);
+            LOG.trace("mojibuster utf16Specialist returned {} candidates", 
utf16Results.size());
             for (EncodingResult r : utf16Results) {
                 String name = r.getCharset().name();
                 boolean invalid =
                         ("UTF-16LE".equals(name) && wide.invalidUtf16Le)
                         || ("UTF-16BE".equals(name) && wide.invalidUtf16Be);
+                LOG.trace("mojibuster utf16Specialist candidate={} invalid={}", name, invalid);
                 if (!invalid) {
+                    LOG.trace("mojibuster -> utf16 short-circuit {}", name);
                     return List.of(new EncodingResult(r.getCharset(),
                             UTF32_STRUCTURAL_CONF, r.getLabel(),
                             EncodingResult.ResultType.STRUCTURAL));
@@ -251,6 +288,29 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         //   • AMBIGUOUS (pure ASCII or only truncated lead): no
         //     emission; NB + fallbacks handle it.
         StructuralEncodingRules.Utf8Result utf8 = 
StructuralEncodingRules.checkUtf8(probe);
+        // TACTICAL: tolerate small corruption.  If the grammar check returned
+        // NOT_UTF8 but the malformed-byte fraction is tiny, treat as UTF-8 —
+        // a single bad continuation byte in 2KB of CJK is nearly always
+        // corruption, not "this isn't UTF-8".  Remove when grammar check is
+        // replaced with a probabilistic decoder.
+        boolean utf8Tolerated = false;
+        if (utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8) {
+            int errors = StructuralEncodingRules.countUtf8Errors(probe);
+            if (errors > 0
+                    && (double) errors / probe.length <= 
UTF8_MALFORMED_TOLERANCE) {
+                utf8Tolerated = true;
+                LOG.trace("mojibuster utf8 NOT_UTF8 tolerated: {} error events in {}B ({}%)",
+                        errors, probe.length,
+                        String.format(Locale.ROOT, "%.3f",
+                                100.0 * errors / probe.length));
+            } else if (errors > 0) {
+                LOG.trace("mojibuster utf8 NOT_UTF8 NOT tolerated: {} error events in {}B ({}%)",
+                        errors, probe.length,
+                        String.format(Locale.ROOT, "%.3f",
+                                100.0 * errors / probe.length));
+            }
+        }
+        LOG.trace("mojibuster utf8Check={} tolerated={}", utf8, utf8Tolerated);
         if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8) {
             pool.add(new EncodingResult(
                     java.nio.charset.StandardCharsets.UTF_8,
@@ -267,11 +327,22 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
 
         // Naive-Bayes top-K candidates — statistical.
         List<EncodingResult> nbResults = nb.detect(nbInput);
+        if (LOG.isTraceEnabled()) {
+            StringBuilder sb = new StringBuilder();
+            for (EncodingResult r : nbResults) {
+                if (sb.length() > 0) sb.append(", ");
+                sb.append(r.getCharset().name())
+                  .append("@").append(String.format(Locale.ROOT, "%.2f", 
r.getConfidence()));
+            }
+            LOG.trace("mojibuster nb({}B input) -> [{}]", nbInput.length, sb);
+        }
         for (EncodingResult r : nbResults) {
             String name = r.getCharset().name();
-            // NOT_UTF8 disqualifier.
+            // NOT_UTF8 disqualifier — applied unless the malformed-byte
+            // fraction is tiny (see UTF8_MALFORMED_TOLERANCE).
             if ("UTF-8".equals(name)
-                    && utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8) {
+                    && utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8
+                    && !utf8Tolerated) {
                 continue;
             }
             pool.add(r);
@@ -281,7 +352,30 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         // Low-evidence Latin-sibling → windows-1252 rewrite.  Runs
         // after sort so only the final top candidate is considered
         // for the rewrite, preserving lower-ranked siblings.
-        return applyLatinSiblingFallback(probe, ranked);
+        List<EncodingResult> finalResults = applyLatinSiblingFallback(probe, 
ranked);
+        // Never return an empty list.  An empty result propagates up as
+        // "Failed to detect the character encoding of a document" in
+        // AutoDetectReader.detect, which kills parsing entirely.  When
+        // every layer has rejected its candidates (NOT_UTF8 disqualifier
+        // dropped NB's only pick, NB returned no candidates at all,
+        // wide-Unicode and UTF-16 specialists abstained), fall back to
+        // the WHATWG default.  Downstream JunkFilter / declarative
+        // candidates can still override at low confidence.
+        if (finalResults.isEmpty()) {
+            LOG.trace("mojibuster pool empty -> windows-1252 fallback");
+            return windows1252Fallback();
+        }
+        if (LOG.isTraceEnabled()) {
+            StringBuilder sb = new StringBuilder();
+            for (EncodingResult r : finalResults) {
+                if (sb.length() > 0) sb.append(", ");
+                sb.append(r.getCharset().name())
+                  .append("[").append(r.getResultType()).append("]")
+                  .append("@").append(String.format(Locale.ROOT, "%.2f", 
r.getConfidence()));
+            }
+            LOG.trace("mojibuster exit ({} results) [{}]", 
finalResults.size(), sb);
+        }
+        return finalResults;
     }
 
     /**
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
index d220031bac..2c46a8db3a 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
@@ -749,6 +749,114 @@ public final class StructuralEncodingRules {
         return Utf8Result.AMBIGUOUS;
     }
 
+    /**
+     * Counts malformed UTF-8 sequence events across the whole array.
+     * Convenience overload of {@link #countUtf8Errors(byte[], int, int)}
+     * covering {@code bytes[0 .. bytes.length)}.
+     *
+     * @param bytes the sample to scan; must not be {@code null}
+     * @return number of malformed UTF-8 sequence events; 0 for a clean stream
+     */
+    public static int countUtf8Errors(byte[] bytes) {
+        return countUtf8Errors(bytes, 0, bytes.length);
+    }
+
+    /**
+     * Counts the number of malformed UTF-8 <em>sequences</em> in
+     * {@code bytes[offset .. offset + length)} — one event per bad lead,
+     * orphaned continuation, overlong, surrogate, or out-of-range codepoint,
+     * regardless of how many bytes the bad sequence spans.  Unlike
+     * {@link #checkUtf8}, this does not early-exit on the first bad sequence;
+     * it scans the entire range, resyncing after each error.  Returns 0 for
+     * a clean UTF-8 stream.
+     *
+     * <p>Useful for "tolerant" UTF-8 acceptance: a real-world UTF-8 file with
+     * a few corrupted sequences (copy-paste artefact, truncated upstream,
+     * MIME transport flip) should still be recognized as UTF-8 rather than
+     * rejected outright.  Caller decides what error count is tolerable
+     * (typically as a fraction of probe length).</p>
+     *
+     * <p>Resynchronization: when a lead byte is followed by a byte that is
+     * not a continuation, the scan resumes at that offending byte, so a
+     * valid character (or a further error) starting there is still seen.
+     * The count approximates the number of U+FFFD that
+     * {@code new String(bytes, UTF_8)} would emit but is not guaranteed to
+     * equal it: illegal codepoints (overlong 3-/4-byte forms, surrogates,
+     * values above U+10FFFF) are counted as one event per sequence here,
+     * whereas a decoder may replace their bytes individually.  A multi-byte
+     * sequence cut off by the end of the range is treated as truncation,
+     * not as an error.</p>
+     *
+     * @param bytes  the sample to scan; must not be {@code null}
+     * @param offset index of the first byte to scan
+     * @param length number of bytes to scan; {@code offset + length} must
+     *               not exceed {@code bytes.length}
+     * @return number of malformed UTF-8 sequence events in the range
+     * @throws ArrayIndexOutOfBoundsException if the range lies outside
+     *         {@code bytes}
+     */
+    public static int countUtf8Errors(byte[] bytes, int offset, int length) {
+        int errors = 0;
+        int i = offset;
+        int end = offset + length;
+        while (i < end) {
+            int b = bytes[i] & 0xFF;
+            if (b < 0x80) {
+                i++;
+                continue;
+            }
+            // Bytes that can never start a valid sequence: orphaned
+            // continuations (0x80-0xBF), overlong 2-byte leads (0xC0/0xC1)
+            // and 5-/6-byte leads (0xF8+).  One event each; resume at i + 1.
+            if (b < 0xC2 || b >= 0xF8) {
+                errors++;
+                i++;
+                continue;
+            }
+            int seqLen = b >= 0xF0 ? 4 : (b >= 0xE0 ? 3 : 2);
+            // Truncated at probe-end is not an error — just stop here.
+            if (end - i < seqLen) {
+                break;
+            }
+            // Locate the first byte that is not a continuation (0x80-0xBF).
+            int k = 1;
+            while (k < seqLen && (bytes[i + k] & 0xC0) == 0x80) {
+                k++;
+            }
+            if (k < seqLen) {
+                errors++;
+                // Resume AT the offending byte, not after the intended
+                // sequence: it is ASCII or a lead, so it may begin a valid
+                // character (or a second error) that must not be skipped.
+                // The k - 1 continuations already consumed belong to this
+                // event, so they are not re-counted as orphans.
+                i += k;
+                continue;
+            }
+            // All continuations present: reject overlong forms, surrogates
+            // and codepoints above U+10FFFF.  Either way the sequence is
+            // consumed whole, so a bad codepoint is a single event.
+            if (seqLen == 3) {
+                int cp = ((b & 0x0F) << 12)
+                        | ((bytes[i + 1] & 0x3F) << 6)
+                        | (bytes[i + 2] & 0x3F);
+                if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
+                    errors++;
+                }
+            } else if (seqLen == 4) {
+                int cp = ((b & 0x07) << 18)
+                        | ((bytes[i + 1] & 0x3F) << 12)
+                        | ((bytes[i + 2] & 0x3F) << 6)
+                        | (bytes[i + 3] & 0x3F);
+                if (cp < 0x10000 || cp > 0x10FFFF) {
+                    errors++;
+                }
+            }
+            i += seqLen;
+        }
+        return errors;
+    }
+
     // -----------------------------------------------------------------------
     //  Result type
     // -----------------------------------------------------------------------
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index fb57dd6f93..fb903f5b00 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -39,6 +39,7 @@ import org.apache.tika.ml.chardetect.HtmlByteStripper;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.quality.TextQualityComparison;
 import org.apache.tika.quality.TextQualityDetector;
+import org.apache.tika.quality.TextQualityScore;
 
 /**
  * A {@link MetaEncodingDetector} that arbitrates charset candidates by
@@ -75,6 +76,34 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
      * default read limit used by the charset base detectors. */
     private static final int DEFAULT_READ_LIMIT = 16384;
 
+    // ---------------------------------------------------------------------
+    // TACTICAL: declarative-override gate constants.
+    //
+    // These exist to compensate for known per-script calibration unevenness
+    // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/
+    // BENGALI floors too strict).  They produce wrong tournaments when an
+    // honest in-document declaration (`<meta charset>` / XML decl) decodes
+    // to sparse non-Latin content that scores junky-but-correct, while a
+    // statistical pick decodes to dense mojibake-Han that scores decent-
+    // but-wrong.  See `analyses/2026-04-26-tika-eval-charset-and-other.md`
+    // and the indic-collapse + Korean+Hanja fixtures.
+    //
+    // REMOVE when the quality scorer is recalibrated per-script — the
+    // tournament should then be reliable on its own.
+    // ---------------------------------------------------------------------
+
+    /** Maximum delta in z-score units we tolerate before honoring the
+     *  in-document declaration over the tournament winner.  Tuned so that
+     *  small same-script-different-codepage deltas (windows-1252 vs
+     *  windows-1257 ≈ 1-2 units) don't trigger override when scripts
+     *  match, while indic-vs-mojibake-Han deltas (~3-5 units) do. */
+    private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f;
+
+    /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared
+     *  decoder's output.  Above this, the declared charset clearly cannot
+     *  decode the bytes and we should not honor the declaration. */
+    private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01;
+
     /** Cached quality detector.  {@code null} if none is on the classpath. */
     private final TextQualityDetector qualityDetector;
 
@@ -148,7 +177,10 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         byte[] stripDst = new byte[bytes.length];
         HtmlByteStripper.Result stripped =
                 HtmlByteStripper.strip(bytes, 0, bytes.length, stripDst, 0);
-        if (stripped.tagCount > 0 && stripped.length > 0) {
+        boolean stripUsed = stripped.tagCount > 0 && stripped.length > 0;
+        LOG.trace("junk-filter strip: input={}B tagCount={} stripped={}B used={}",
+                bytes.length, stripped.tagCount, stripped.length, stripUsed);
+        if (stripUsed) {
             forDecode = new byte[stripped.length];
             System.arraycopy(stripDst, 0, forDecode, 0, stripped.length);
         }
@@ -160,10 +192,22 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
             String decoded = safeDecode(forDecode, cs);
             if (decoded != null && !decoded.isEmpty()) {
                 candidates.put(cs, decoded);
+                if (LOG.isTraceEnabled()) {
+                    int sampleLen = Math.min(400, decoded.length());
+                    String sample = decoded.substring(0, sampleLen)
+                            .replace('\n', ' ').replace('\r', ' ');
+                    LOG.trace("junk-filter decoded {}: '{}{}' (len={})",
+                            cs.name(), sample,
+                            decoded.length() > sampleLen ? "…" : "",
+                            decoded.length());
+                }
+            } else {
+                LOG.trace("junk-filter decode {} -> null/empty", cs.name());
             }
         }
         if (candidates.size() <= 1) {
             // One or zero candidates produced usable text; nothing to compare.
+            LOG.trace("junk-filter <=1 usable candidate, abstaining");
             return Collections.emptyList();
         }
         // When a DECLARATIVE candidate decodes byte-identically to at least
@@ -175,6 +219,8 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         if (declared != null) {
             float conf = context.getTopConfidenceFor(declared);
             context.setArbitrationInfo("junk-filter-prefer-declarative");
+            LOG.trace("junk-filter -> {} (declarative with equivalent decode)",
+                    declared.name());
             return List.of(new EncodingResult(declared, conf));
         }
         if (allDecodingsIdentical(candidates)) {
@@ -182,6 +228,7 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
             // is present.  Text quality cannot distinguish them; defer to
             // the composite's default ordering.
             context.setArbitrationInfo("junk-filter-identical-decodings");
+            LOG.trace("junk-filter all decodings identical, deferring");
             return Collections.emptyList();
         }
 
@@ -189,21 +236,158 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         // every subsequent candidate challenges the current champion.
         Iterator<Map.Entry<Charset, String>> it = 
candidates.entrySet().iterator();
         Map.Entry<Charset, String> champion = it.next();
+        LOG.trace("junk-filter tournament seed: {}", champion.getKey().name());
         while (it.hasNext()) {
             Map.Entry<Charset, String> challenger = it.next();
             TextQualityComparison cmp = qualityDetector.compare(
                     champion.getKey().name(), champion.getValue(),
                     challenger.getKey().name(), challenger.getValue());
+            LOG.trace("junk-filter compare {} vs {} -> {} (delta={} A={} 
B={})",
+                    champion.getKey().name(), challenger.getKey().name(),
+                    cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f", 
cmp.delta()),
+                    cmp.scoreA(), cmp.scoreB());
             if ("B".equals(cmp.winner())) {
                 champion = challenger;
             }
         }
+        LOG.trace("junk-filter -> {} (tournament champion)", 
champion.getKey().name());
+
+        // TACTICAL: declarative override.  See class-level comment block.
+        // REMOVE when quality scorer is recalibrated per-script.
+        Charset declarativeOverride = applyInDocumentDeclarativeOverride(
+                context, candidates, champion.getKey());
+        if (declarativeOverride != null) {
+            float conf = context.getTopConfidenceFor(declarativeOverride);
+            context.setArbitrationInfo("junk-filter-declarative-override");
+            LOG.trace("junk-filter -> {} (declarative override of tournament 
winner {})",
+                    declarativeOverride.name(), champion.getKey().name());
+            return List.of(new EncodingResult(declarativeOverride, conf));
+        }
 
         float confidence = context.getTopConfidenceFor(champion.getKey());
         context.setArbitrationInfo("junk-filter-selected");
         return List.of(new EncodingResult(champion.getKey(), confidence));
     }
 
+    /**
+     * Tactical fix: honor an in-document {@code <meta charset>} or XML
+     * declaration when the quality scorer's per-script calibration unevenness
+     * would otherwise mis-rank candidates of <em>different scripts</em>.
+     *
+     * <p>Returns the in-document declared charset to use, or {@code null} to
+     * leave the tournament winner intact.</p>
+     *
+     * <p>Preconditions checked first: an in-document declaration exists, it is
+     * not already the tournament winner, and its charset name does not start
+     * with {@code UTF-16} or {@code UTF-32}.</p>
+     *
+     * <p>Gates (all must hold to override; listed in evaluation order):</p>
+     * <ol>
+     *   <li><strong>(b) Both decoded</strong>: declared and tournament winner
+     *       both have an entry in the candidate map; a missing entry for
+     *       either one means no override.</li>
+     *   <li><strong>(a) Decode is mostly clean</strong>: the declared decode's
+     *       U+FFFD rate is no more than {@link #DECLARATIVE_MAX_FFFD_RATE}
+     *       (the override is skipped only when the rate is strictly
+     *       greater).</li>
+     *   <li><strong>(c) Quality gap small</strong>: tournament winner's z-score
+     *       is not vastly higher than the declared's; specifically
+     *       {@code winner.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.</li>
+     *   <li><strong>(d) Different scripts</strong>: declared and winner classify
+     *       as different scripts.  Same-script Latin-cousin lies (e.g. windows-1252
+     *       declared on a windows-1257 file) fall through to the tournament,
+     *       which correctly handles them via byte-distribution scoring.  A
+     *       {@code null} dominant script for the declared decode also means
+     *       no override.</li>
+     * </ol>
+     *
+     * <p>"In-document" means {@code HtmlEncodingDetector} or any future XML-decl
+     * source — explicitly NOT {@code MetadataCharsetDetector} (outer Content-Type
+     * header), which is more often wrong.</p>
+     *
+     * @param context    detection context; queried for the in-document declaration
+     * @param candidates decoded text per candidate charset
+     * @param champion   charset that won the quality tournament
+     * @return the declared charset when every gate passes, otherwise {@code null}
+     */
+    private Charset applyInDocumentDeclarativeOverride(
+            EncodingDetectorContext context,
+            Map<Charset, String> candidates,
+            Charset champion) {
+        Charset declared = findInDocumentDeclarative(context);
+        if (declared == null) {
+            return null;
+        }
+        if (declared.equals(champion)) {
+            return null; // already winning
+        }
+        // Per HTML5 spec, <meta charset> cannot validly declare UTF-16 / UTF-32:
+        // the meta tag itself is bytes that have to be parsed before its
+        // declaration is known, and UTF-16/32 require a BOM.  If the
+        // declaration claims UTF-16/32 and no BOM was found (BOMDetector runs
+        // first in the chain), we treat the declaration as invalid and let
+        // the tournament winner stand.  This catches govdocs1-style "utf-16
+        // declared on a Latin file" lies that would otherwise look like a
+        // legitimate script-mismatch override.
+        // NOTE(review): no BOM check happens in this method; the rejection is
+        // purely by charset-name prefix and relies on the chain ordering
+        // described above — confirm that ordering still holds.
+        String declaredName = declared.name();
+        if (declaredName.startsWith("UTF-16") || 
declaredName.startsWith("UTF-32")) {
+            LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in 
<meta> (HTML5 invalid)");
+            return null;
+        }
+        String championText = candidates.get(champion);
+        String declaredText = candidates.get(declared);
+        if (declaredText == null || championText == null) {
+            return null; // failed to decode
+        }
+        // (a) decode mostly clean
+        double fffdRate = replacementCharRate(declaredText);
+        if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) {
+            LOG.trace("junk-filter declarative-override skipped: U+FFFD rate 
{} > {}",
+                    fffdRate, DECLARATIVE_MAX_FFFD_RATE);
+            return null;
+        }
+        // Each decode is scored on its own here; the delta below is computed
+        // from these two scores.
+        TextQualityScore declaredScore = qualityDetector.score(declaredText);
+        TextQualityScore championScore = qualityDetector.score(championText);
+        // (c) winner not vastly higher
+        float delta = championScore.getZScore() - declaredScore.getZScore();
+        if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) {
+            LOG.trace("junk-filter declarative-override skipped: delta {} > 
{}",
+                    delta, DECLARATIVE_OVERRIDE_MAX_DELTA);
+            return null;
+        }
+        // (d) different scripts
+        // A null declared script also takes this branch, so the trace line
+        // below can read "same script null" in that case.
+        String declaredScript = declaredScore.getDominantScript();
+        String championScript = championScore.getDominantScript();
+        if (declaredScript == null || declaredScript.equals(championScript)) {
+            LOG.trace("junk-filter declarative-override skipped: same script 
{}",
+                    declaredScript);
+            return null;
+        }
+        LOG.trace("junk-filter declarative-override fires: declared={} 
(script={}, z={}) vs winner={} (script={}, z={}) delta={}",
+                declared.name(), declaredScript, declaredScore.getZScore(),
+                champion.name(), championScript, championScore.getZScore(), 
delta);
+        return declared;
+    }
+
+    /**
+     * Walks the context's results in order and returns the charset of the
+     * first DECLARATIVE result whose detector name is
+     * {@code HtmlEncodingDetector} or {@code StandardHtmlEncodingDetector}.
+     * Results from any other detector — including outer Content-Type metadata
+     * ({@code MetadataCharsetDetector}) — are intentionally ignored, because
+     * those headers lie too often.
+     *
+     * @param context detection context holding the per-detector results
+     * @return the in-document declared charset, or {@code null} if none
+     */
+    private static Charset findInDocumentDeclarative(EncodingDetectorContext context) {
+        for (EncodingDetectorContext.Result result : context.getResults()) {
+            String detector = result.getDetectorName();
+            boolean fromHtmlDetector = "HtmlEncodingDetector".equals(detector)
+                    || "StandardHtmlEncodingDetector".equals(detector);
+            if (!fromHtmlDetector) {
+                continue;
+            }
+            if (result.getResultType() == EncodingResult.ResultType.DECLARATIVE) {
+                return result.getCharset();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Fraction of {@code U+FFFD} (REPLACEMENT CHARACTER) chars in the decoded
+     * String — a proxy for "this charset cannot decode these bytes".
+     *
+     * @param s decoded text; must not be {@code null}
+     * @return number of U+FFFD chars divided by {@code s.length()}, or
+     *         {@code 0.0} for an empty string
+     */
+    private static double replacementCharRate(String s) {
+        if (s.isEmpty()) {
+            return 0.0;
+        }
+        long count = 0;
+        for (int i = 0; i < s.length(); i++) {
+            // Written as a Unicode escape, not as the raw character, so the
+            // comparison does not depend on the source file's encoding.
+            if (s.charAt(i) == '\uFFFD') {
+                count++;
+            }
+        }
+        return (double) count / s.length();
+    }
+
     /**
      * Return the first DECLARATIVE charset whose decoded output equals at
      * least one other candidate's, or {@code null}.
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
new file mode 100644
index 0000000000..06427990cf
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.ml.chardetect.HtmlByteStripper;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * One-off probe: score a file's first 16KB under each candidate charset,
+ * as decoded, with HTML entity refs expanded, and with them removed. Run via:
+ * {@code mvn -pl :tika-ml-junkdetect exec:java -Dexec.classpathScope=test
+ *        -Dexec.mainClass=org.apache.tika.ml.junkdetect.EntityRefProbe
+ *        -Dexec.args="<file> <charset1> [charset2] ..."}.
+ */
+public class EntityRefProbe {
+
+    /** Only this many leading bytes of the input file are read and scored. */
+    private static final int MAX_PROBE_BYTES = 16384;
+
+    /** Upper bound on the number of chars echoed per sample line. */
+    private static final int SAMPLE_CHARS = 180;
+
+    /**
+     * Matches a single entity reference.  Exactly one group is non-null per
+     * match: group 1 = decimal digits, group 2 = hex digits, group 3 = one of
+     * a small set of named refs likely to appear in HTML.
+     */
+    private static final Pattern ENTITY_REF = Pattern.compile(
+            "&(?:#(\\d{1,7})"
+                    + "|#[xX]([0-9a-fA-F]{1,6})"
+                    + "|(amp|lt|gt|quot|apos|nbsp|copy|reg));");
+
+    /**
+     * Entry point.
+     *
+     * @param args {@code <file> <charset1> [charset2] ...}
+     * @throws Exception if the file cannot be read or the detector cannot be loaded
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 2) {
+            System.err.println("usage: EntityRefProbe <file> <charset1> [charset2] ...");
+            System.exit(2);
+        }
+        // Read only the probed prefix rather than loading the whole file.
+        byte[] raw;
+        try (java.io.InputStream in = Files.newInputStream(Paths.get(args[0]))) {
+            raw = in.readNBytes(MAX_PROBE_BYTES);
+        }
+        // Strip HTML the same way JunkFilterEncodingDetector does.
+        byte[] dst = new byte[raw.length];
+        HtmlByteStripper.Result strip =
+                HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+        // Fall back to the unstripped bytes when no tags were seen or
+        // stripping left nothing behind.
+        byte[] forDecode = raw;
+        if (strip.tagCount > 0 && strip.length > 0) {
+            forDecode = java.util.Arrays.copyOf(dst, strip.length);
+        }
+        System.out.printf("input=%dB tagCount=%d stripped=%dB%n",
+                raw.length, strip.tagCount, forDecode.length);
+
+        JunkDetector jd = JunkDetector.loadFromClasspath();
+        for (int i = 1; i < args.length; i++) {
+            String csName = args[i];
+            Charset cs;
+            try {
+                cs = Charset.forName(csName);
+            } catch (IllegalArgumentException e) {
+                // Covers both illegal and unsupported charset names; keep
+                // probing the remaining charsets instead of aborting.
+                System.err.printf("skipping charset '%s': %s%n", csName, e);
+                continue;
+            }
+            String decoded = new String(forDecode, cs);
+            String expanded = expandEntities(decoded);
+            String removed = removeEntities(decoded);
+            System.out.println();
+            System.out.printf("== %s ==%n", csName);
+            System.out.printf("  raw       len=%-5d  %s%n", decoded.length(), jd.score(decoded));
+            System.out.printf("  expanded  len=%-5d  %s%n", expanded.length(), jd.score(expanded));
+            System.out.printf("  removed   len=%-5d  %s%n", removed.length(), jd.score(removed));
+            System.out.printf("  raw      : %s%n", sample(decoded));
+            System.out.printf("  expanded : %s%n", sample(expanded));
+            System.out.printf("  removed  : %s%n", sample(removed));
+        }
+    }
+
+    /**
+     * First {@link #SAMPLE_CHARS} chars of {@code text} on one line (CR/LF
+     * become spaces), with a trailing ellipsis only when text was cut off.
+     */
+    private static String sample(String text) {
+        int end = Math.min(SAMPLE_CHARS, text.length());
+        // Do not cut a surrogate pair in half at the truncation point.
+        if (end > 0 && end < text.length() && Character.isHighSurrogate(text.charAt(end - 1))) {
+            end--;
+        }
+        String head = text.substring(0, end).replace('\n', ' ').replace('\r', ' ');
+        return end < text.length() ? head + "\u2026" : head;
+    }
+
+    /**
+     * Expand every numeric/named entity ref in a single left-to-right pass.
+     * One pass matters: text produced by an expansion is never rescanned, so
+     * {@code &amp;lt;} becomes {@code &lt;} and not {@code <}.  Numeric refs
+     * that do not name a Unicode scalar value (out of range, or a surrogate)
+     * are left as written.
+     */
+    private static String expandEntities(String s) {
+        Matcher m = ENTITY_REF.matcher(s);
+        StringBuilder buf = new StringBuilder(s.length());
+        int last = 0;
+        while (m.find()) {
+            buf.append(s, last, m.start());
+            String named = m.group(3);
+            if (named != null) {
+                buf.append(expandNamed(named));
+            } else {
+                boolean hex = m.group(2) != null;
+                // The pattern caps the digit count (7 decimal / 6 hex), so
+                // the value always fits in an int and parseInt cannot throw.
+                int cp = Integer.parseInt(hex ? m.group(2) : m.group(1), hex ? 16 : 10);
+                if (isScalarValue(cp)) {
+                    buf.appendCodePoint(cp);
+                } else {
+                    buf.append(m.group());
+                }
+            }
+            last = m.end();
+        }
+        buf.append(s, last, s.length());
+        return buf.toString();
+    }
+
+    /** Replacement text for one of the named refs accepted by {@link #ENTITY_REF}. */
+    private static String expandNamed(String name) {
+        switch (name) {
+            case "amp":
+                return "&";
+            case "lt":
+                return "<";
+            case "gt":
+                return ">";
+            case "quot":
+                return "\"";
+            case "apos":
+                return "'";
+            case "nbsp":
+                return " ";
+            case "copy":
+                return "\u00A9"; // COPYRIGHT SIGN
+            case "reg":
+                return "\u00AE"; // REGISTERED SIGN
+            default:
+                // Unreachable while the pattern and this switch agree.
+                return "&" + name + ";";
+        }
+    }
+
+    /** True for valid code points outside the surrogate range U+D800..U+DFFF. */
+    private static boolean isScalarValue(int cp) {
+        return Character.isValidCodePoint(cp)
+                && (cp < Character.MIN_SURROGATE || cp > Character.MAX_SURROGATE);
+    }
+
+    /**
+     * Replace every numeric/named entity ref with a single space. Removal
+     * (rather than expansion) keeps the per-charset script signal clean —
+     * expansion injects Unicode codepoints that don't come from the candidate
+     * charset's bytes and can dominate the actual decoded-charset signal.
+     */
+    private static String removeEntities(String s) {
+        return ENTITY_REF.matcher(s).replaceAll(" ");
+    }
+}

(tika) branch main updated: charset and junk tweaks (#2794)

Reply via email to