This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4744-nb-cohort-cap
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4969062d15b508dc69e7a910a23d0d142caa5bfe
Author: tallison <[email protected]>
AuthorDate: Thu May 28 14:34:40 2026 -0400

    add cohort-specific caps
---
 .skills/tika-eval-encoding-regression.md           | 171 +++++++++++++++++++++
 .../NaiveBayesBigramEncodingDetector.java          | 108 ++++++++++---
 2 files changed, 256 insertions(+), 23 deletions(-)

diff --git a/.skills/tika-eval-encoding-regression.md 
b/.skills/tika-eval-encoding-regression.md
new file mode 100644
index 0000000000..148acd05df
--- /dev/null
+++ b/.skills/tika-eval-encoding-regression.md
@@ -0,0 +1,171 @@
+# tika-eval for encoding-detector regression hunts
+
+A condensed pattern for finding SBCS→CJK style charset-detector regressions
+(or any "A picks encoding X, B picks encoding Y" question) without
+building two tika-app distributions.
+
+## Two configs, one build
+
+Encoding-detector experiments don't need a "before" and "after" tika-app —
+the chain composition is per-config. Run the SAME tika-app twice against
+two configs, treat the outputs as `-a` and `-b`. Much faster than
+`tika-eval-compare`'s two-build flow.
+
+```bash
+# build once
+./mvnw clean install -pl tika-app -am -Pfast -DskipTests \
+  -Dmaven.repo.local=$(pwd)/.local_m2_repo
+unzip -q tika-app/target/tika-app-*.zip -d /tmp/tika-app-current
+
+# two configs (any combination of detectors)
+java -jar /tmp/tika-app-current/tika-app-*.jar \
+  --config=tika-config-3x-default.json \
+  -i <corpus> -o ~/data/extracts/A -n 6
+java -jar /tmp/tika-app-current/tika-app-*.jar \
+  --config=tika-config-junkfilter-combiner.json \
+  -i <corpus> -o ~/data/extracts/B -n 6
+
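+# normal Compare (assumes tika-eval-app is already unpacked in /tmp/tika-eval-current; the build above only produces tika-app)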
+java -jar /tmp/tika-eval-current/tika-eval-app-*.jar Compare \
+  -a ~/data/extracts/A -b ~/data/extracts/B -d ~/data/extracts/A-vs-B -r -rd ~/data/extracts/A-vs-B-reports
+```
+
+### Canonical 3.x-default encoding chain config
+
+```json
+{
+  "encoding-detectors": [
+    {"html-encoding-detector": {}},
+    {"universal-encoding-detector": {}},
+    {"icu4j-encoding-detector": {}}
+  ]
+}
+```
+
+Existing copy: `~/data/claude-work/tika-config-3x-default.json`.
+
+### Canonical 4.x junkfilter chain config
+
+```json
+{
+  "encoding-detectors": [
+    {"bom-detector": {}},
+    {"html-encoding-detector": {}},
+    {"mojibuster-encoding-detector": {}},
+    {"junk-filter-encoding-detector": {}}
+  ]
+}
+```
+
+Existing copy: `~/data/smoke/eval-runtime/tika-config-junkfilter-combiner.json`.
+
+### Per-detector isolation configs
+
+Configs that wire each detector alone live in `~/data/commoncrawl/cc-html-eval/configs/`:
+`tika-config-bom.json`, `tika-config-html.json`, `tika-config-htmlstandard.json`,
+`tika-config-universal.json`, `tika-config-icu4j.json`,
+`tika-config-mojibuster.json`, `tika-config-junkfilter-chain.json`.
+Use these for chain-attribution work (which detector did the detection).
+
+## Encoding-pair flip query
+
+`MIMES.MIME_STRING` for text-y mimes has the form `text/html; charset=X`. Extract
+the charset with `REGEXP_REPLACE`, group by `(enc_a, enc_b)`, filter pairs.
+A=before/`-a`, B=after/`-b`; join on `pa.ID = pb.ID` (paired by id).
+
+```sql
+SELECT
+  REGEXP_REPLACE(ma.MIME_STRING, '^.*charset=', '') AS enc_a,
+  REGEXP_REPLACE(mb.MIME_STRING, '^.*charset=', '') AS enc_b,
+  COUNT(*) n,
+  SUM(cb.NUM_COMMON_TOKENS - ca.NUM_COMMON_TOKENS) AS delta_common
+FROM PROFILES_A pa
+JOIN PROFILES_B pb ON pa.ID = pb.ID
+JOIN MIMES ma ON pa.MIME_ID = ma.MIME_ID
+JOIN MIMES mb ON pb.MIME_ID = mb.MIME_ID
+JOIN CONTENTS_A ca ON ca.ID = pa.ID
+JOIN CONTENTS_B cb ON cb.ID = pb.ID
+WHERE ma.MIME_STRING LIKE '%charset=%' AND mb.MIME_STRING LIKE '%charset=%'
+  AND REGEXP_REPLACE(ma.MIME_STRING, '^.*charset=', '') <>
+      REGEXP_REPLACE(mb.MIME_STRING, '^.*charset=', '')
+GROUP BY enc_a, enc_b
+ORDER BY n DESC, delta_common ASC LIMIT 50;
+```
+
+Add an `IN (...)` filter on either side to constrain to a family
+(e.g. SBCS-Western → CJK; the target list below also includes Thai `x-windows-874`):
+
+```sql
+  AND REGEXP_REPLACE(ma.MIME_STRING,'^.*charset=','')
+      IN ('windows-1252','ISO-8859-1','ISO-8859-15','ISO-8859-2','ISO-8859-3',
+          'windows-1250','windows-1254','windows-1257','ISO-8859-13',
+          'windows-1258','x-MacRoman','IBM850','IBM852')
+  AND REGEXP_REPLACE(mb.MIME_STRING,'^.*charset=','')
+      IN ('GB18030','GBK','GB2312','Big5','Big5-HKSCS','Shift_JIS','EUC-JP',
+          'EUC-KR','x-EUC-TW','x-windows-874','x-windows-949',
+          'ISO-2022-JP','ISO-2022-KR','ISO-2022-CN')
+```
+
+### Per-file drilldown
+
+Join `CONTAINERS` to get the source path; pull `LANG_ID_1` from both sides
+to see whether language detection agrees the content is Western while the
+charset has flipped to CJK (the regression's defining shape):
+
+```sql
+SELECT ct.FILE_PATH,
+       REGEXP_REPLACE(ma.MIME_STRING,'^.*charset=','') AS enc_a,
+       REGEXP_REPLACE(mb.MIME_STRING,'^.*charset=','') AS enc_b,
+       ca.NUM_COMMON_TOKENS AS ca_tok, cb.NUM_COMMON_TOKENS AS cb_tok,
+       cb.NUM_COMMON_TOKENS - ca.NUM_COMMON_TOKENS AS delta,
+       ca.LANG_ID_1 AS lang_a, cb.LANG_ID_1 AS lang_b
+FROM PROFILES_A pa JOIN PROFILES_B pb ON pa.ID = pb.ID
+JOIN MIMES ma ON pa.MIME_ID = ma.MIME_ID JOIN MIMES mb ON pb.MIME_ID = mb.MIME_ID
+JOIN CONTENTS_A ca ON ca.ID = pa.ID JOIN CONTENTS_B cb ON cb.ID = pb.ID
+JOIN CONTAINERS ct ON ct.CONTAINER_ID = pa.CONTAINER_ID
+WHERE <enc_a/enc_b filter as above>
+ORDER BY delta ASC LIMIT 15;
+```
+
+## Per-file detector attribution (`X-TIKA:encodingDetectionTrace`)
+
+Every JSON extract from a chain with multiple detectors carries
+`X-TIKA:encodingDetectionTrace` in metadata. It's a per-detector emission
+log with the META detector's arbitration tag at the end:
+
+```
+MojibusterEncodingDetector->Shift_JIS[STATISTICAL](1.00) [junk-filter-selected]
+```
+
+When investigating "why did B pick X for this file?", read this trace first
+— it tells you which base detector(s) emitted candidates and which one the
+meta detector chose. If the trace shows ONLY Mojibuster firing with a CJK
+pick, the bug is in Mojibuster's emission (pool too narrow), not in
+JunkFilter's arbitration.
+
+`X-TIKA:encodingDetector` is the simple-name credit string;
+`X-TIKA:detectedEncoding` is the final answer (also in `Content-Encoding`).
+
+## Reproducing a single-file detection without a full chain
+
+```bash
+./mvnw -q -pl tika-ml/tika-ml-junkdetect -Dmaven.repo.local=$(pwd)/.local_m2_repo \
+  -Dexec.classpathScope=test \
+  -Dexec.mainClass=org.apache.tika.ml.junkdetect.TraceJunkFilter \
+  -Dexec.args="--file <path> --auto-candidates --content-cleaner --head-bytes 524288 --sample 120" \
+  exec:java
+```
+
+Key flags:
+- `--auto-candidates` — use Mojibuster's per-file pool as the candidate set
+- `--content-cleaner` — decode each candidate then run text through
+  `HtmlContentCleaner` to match the live chain
+- `--head-bytes 524288` — read up to 512 KB raw to match
+  `AdaptiveProbe.DEFAULT_RAW_CAP`. The default `READ_LIMIT` of 16 KB will
+  give a *different* probe than the live chain on long markup-heavy pages
+  and lead you to disagree with the live chain's pick. Always pass this
+  when reconciling a TraceJunkFilter run with a live extract.
+
+Without `--head-bytes`, you are looking at a different probe than the
+chain saw — this is the most common source of "trace says X, chain
+says Y" confusion.
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 2460656f0c..5e87150fb6 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -26,7 +26,9 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.commons.io.IOUtils;
 
@@ -104,23 +106,15 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
     public static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20;
 
     /**
-     * Per-bigram cross-class total-contribution cap (Type C clipping).
-     * For each distinct bigram in the probe, the top-scoring class's
-     * total contribution (count × logP × idf, after dequantization) is
-     * capped at the runner-up class's contribution + this many nats.
-     *
-     * <p>Defends against corpus-skew pathologies where one class
-     * accumulates extreme bigram mass that swings classification on
-     * one or two byte-pairs alone (e.g., Czech "ČR" digraph in
-     * ISO-8859-2 contributing +186 nats over win-1252 on Italian text).
-     * Length-invariant by construction: the cap is on per-bigram
-     * advantage, regardless of how many times the bigram appears.</p>
-     *
-     * <p>20 nats = e^20 ≈ 5×10^8 probability-ratio advantage per
-     * bigram — preserves legitimate CJK-vs-Latin and other cross-script
-     * signal while bounding the diffuse-corpus-skew tail.</p>
+     * Per-distinct-bigram cap: top-scoring class's contribution is
+     * clipped to the best <em>cross-cohort</em> class's contribution +
+     * this many nats.  Bounds both single-bigram corpus skew and the
+     * diffuse coverage asymmetry where broad-vocab cohorts (CJK,
+     * EBCDIC) collectively swamp narrow-vocab cohorts (LATIN) on
+     * rare-ASCII bigrams that fall to the unseen floor in the narrow
+     * cohort.  See {@link Cohort}.
      */
-    public static final double CAP_PER_BIGRAM_NATS = 20.0;
+    public static final double CAP_PER_BIGRAM_NATS = 10.0;
 
     /**
      * Minimum distinct bigrams required before the per-bigram cap
@@ -149,9 +143,60 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
      */
     public static final int MIN_BIGRAMS_FOR_DIVERSITY_GATE = 100;
 
+    /**
+     * Script / writing-system family used by {@link #CAP_PER_BIGRAM_NATS}.
+     * UTF-8 stands alone so the cap engages on UTF-vs-anything pairs
+     * (UTF-8 misread as win-1252 or as GBK).
+     */
+    public enum Cohort {
+        LATIN, CJK, CYRILLIC, GREEK, HEBREW, ARABIC, THAI, EBCDIC, UTF
+    }
+
+    /**
+     * Class label → cohort.  Must cover every NB-model label; load
+     * fails fast on an unmapped label (model and code travel together
+     * in git, no BWC layer).
+     */
+    private static final Map<String, Cohort> COHORT_TABLE = buildCohortTable();
+
+    private static Map<String, Cohort> buildCohortTable() {
+        Map<String, Cohort> m = new HashMap<>();
+        for (String label : new String[]{
+                "windows-1252", "windows-1250", "windows-1254", "windows-1257",
+                "windows-1258", "ISO-8859-2", "ISO-8859-3", "ISO-8859-16",
+                "x-MacRoman", "IBM850", "IBM852"}) {
+            m.put(label, Cohort.LATIN);
+        }
+        for (String label : new String[]{
+                "Big5-HKSCS", "EUC-JP", "GB18030", "Shift_JIS",
+                "x-EUC-TW", "x-windows-949"}) {
+            m.put(label, Cohort.CJK);
+        }
+        for (String label : new String[]{
+                "windows-1251", "KOI8-R", "KOI8-U", "IBM855", "IBM866",
+                "x-mac-cyrillic"}) {
+            m.put(label, Cohort.CYRILLIC);
+        }
+        m.put("windows-1253", Cohort.GREEK);
+        m.put("windows-1255", Cohort.HEBREW);
+        m.put("windows-1256", Cohort.ARABIC);
+        m.put("windows-874", Cohort.THAI);
+        // Bidi-suffix variants (-ltr/-rtl) share a cohort; toJavaCharsetName
+        // collapses them at Charset lookup, but their bigram tables differ.
+        for (String label : new String[]{
+                "IBM1047", "IBM500", "IBM420-ltr", "IBM420-rtl",
+                "IBM424-ltr", "IBM424-rtl"}) {
+            m.put(label, Cohort.EBCDIC);
+        }
+        m.put("UTF-8", Cohort.UTF);
+        return Collections.unmodifiableMap(m);
+    }
+
     private final String[] labels;
     /** Charset objects cached at load — one {@code Charset.forName} per 
class, ever. */
     private final Charset[] charsets;
+    /** Per-class cohort, parallel to {@link #labels}. */
+    private final Cohort[] cohorts;
     /**
      * Bigram-major int8 logP layout.  Quantized at load time via
      * per-class scale {@code scale[c] = maxAbs(class c's logP column) / 127}.
@@ -198,6 +243,7 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             this.numClasses = dis.readInt();
             this.labels = new String[numClasses];
             this.charsets = new Charset[numClasses];
+            this.cohorts = new Cohort[numClasses];
 
             // Read quantized IDF table + scale.
             float idfScale = dis.readFloat();
@@ -228,6 +274,14 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
                     cs = null;
                 }
                 charsets[c] = cs;
+                Cohort cohort = COHORT_TABLE.get(labels[c]);
+                if (cohort == null) {
+                    throw new IOException(
+                            "NB model class label \"" + labels[c]
+                                    + "\" has no cohort assignment; "
+                                    + "update 
NaiveBayesBigramEncodingDetector.COHORT_TABLE.");
+                }
+                cohorts[c] = cohort;
 
                 scale[c] = dis.readFloat();
                 unseenQ[c] = dis.readByte();
@@ -454,21 +508,29 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             }
 
             // logPs are negative; "best" class for the bigram = highest
-            // (least negative) contribution after dequant.
+            // (least negative) contribution after dequant.  Cap reference
+            // is the best contribution from a class outside top-1's
+            // cohort, so the cap engages on cross-cohort gaps that a
+            // max-vs-overall-runner-up cap missed when multiple classes
+            // in top-1's cohort sat close together.
+            int topClass = -1;
             double max = Double.NEGATIVE_INFINITY;
-            double secondMax = Double.NEGATIVE_INFINITY;
             for (int c = 0; c < numClasses; c++) {
                 double contrib = logP8[base + c] * countTimesIdf * 
perClassDequant[c];
                 contributions[c] = contrib;
                 if (contrib > max) {
-                    secondMax = max;
                     max = contrib;
-                } else if (contrib > secondMax) {
-                    secondMax = contrib;
+                    topClass = c;
+                }
+            }
+            Cohort topCohort = cohorts[topClass];
+            double bestCrossCohort = Double.NEGATIVE_INFINITY;
+            for (int c = 0; c < numClasses; c++) {
+                if (cohorts[c] != topCohort && contributions[c] > 
bestCrossCohort) {
+                    bestCrossCohort = contributions[c];
                 }
             }
-            // Cap any class whose contribution exceeds runner-up + cap.
-            double capValue = secondMax + CAP_PER_BIGRAM_NATS;
+            double capValue = bestCrossCohort + CAP_PER_BIGRAM_NATS;
             if (max > capValue) {
                 for (int c = 0; c < numClasses; c++) {
                     if (contributions[c] > capValue) {

Reply via email to