(tika) 08/09: TIKA-4731 - whitespace cleanup

tallison Tue, 26 May 2026 12:18:48 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git


commit ce713666d6dbc317e7af1b631b2008c533c8b694
Author: tallison <[email protected]>
AuthorDate: Mon May 25 09:27:34 2026 -0400

    TIKA-4731 - whitespace cleanup
---
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  47 +++++-
 .../ml/junkdetect/JunkFilterEncodingDetector.java  |  92 +++++++++++-
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2324972 -> 2317582 
bytes
 .../apache/tika/ml/junkdetect/TraceJunkFilter.java | 166 ++++++++++++++++++++-
 4 files changed, 295 insertions(+), 10 deletions(-)

diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index 8eb8dab8a7..c8c1c90cac 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -1084,6 +1084,16 @@ public final class JunkDetector implements 
TextQualityDetector {
         return f1TablesByScript.get(script);
     }
 
+    /** Per-script z1 calibration {mu, sigma} (package-private, for 
diagnostics). */
+    float[] calibrationFor(String script) {
+        return calibrations.get(script);
+    }
+
+    /** Per-script F1 bigram tables view (package-private, for diagnostics). */
+    Map<String, BigramTables> f1TablesByScriptView() {
+        return f1TablesByScript;
+    }
+
     // -----------------------------------------------------------------------
     // Bucket-by-script bigram enumeration (the keystone).
     // Single source of truth for BOTH inference z1 scoring and training tally.
@@ -1113,6 +1123,31 @@ public final class JunkDetector implements 
TextQualityDetector {
         return Character.isDigit(cp);
     }
 
+    /** Whitespace-equivalents collapse to canonical U+0020 (' ') before bigram
+     *  lookup.  Tab, NBSP, ideographic space, line/paragraph separators, and
+     *  Cc control chars (\\n, \\r, C1 range, etc.) all signal the SAME thing about
+     *  surrounding script context — "end of word" — and shouldn't produce
+     *  distinct bigram entries.  Applied identically in training and inference
+     *  via {@link #forEachScriptBigram}, so the bigram table only ever sees
+     *  the canonical form.  The follow-on (space, space) drop in
+     *  {@link #forEachScriptBigram} then ensures collapsed whitespace runs
+     *  (HtmlContentCleaner indentation residue: \\t\\t, \\n\\r, NBSP·\\t)
+     *  contribute nothing to the COMMON bucket — only (script, space) and
+     *  (space, script) boundary bigrams survive. */
+    static int normalizeWhitespace(int cp) {
+        if (cp == 0x20) {
+            return cp;
+        }
+        int t = Character.getType(cp);
+        if (t == Character.CONTROL
+                || t == Character.SPACE_SEPARATOR
+                || t == Character.LINE_SEPARATOR
+                || t == Character.PARAGRAPH_SEPARATOR) {
+            return 0x20;
+        }
+        return cp;
+    }
+
     /**
      * Enumerates the script-bucketed bigrams of {@code cps} under the redesign
      * representation (see block comment above).  Used identically by inference
@@ -1149,9 +1184,15 @@ public final class JunkDetector implements 
TextQualityDetector {
             return;
         }
         for (int i = 0; i + 1 < cps.length; i++) {
-            int a = cps[i];
-            int b = cps[i + 1];
-            if (isSkipCodepoint(a) || isSkipCodepoint(b)) {
+            int a = normalizeWhitespace(cps[i]);
+            int b = normalizeWhitespace(cps[i + 1]);
+            // (space, space) drop short-circuits ahead of the digit check:
+            // after normalization, every whitespace run (\t\t, \n\r, NBSP·\t,
+            // HtmlContentCleaner indentation salads) collapses to this one
+            // bigram class — high count on HTML, zero script-boundary signal.
+            // (script, space) / (space, script) boundary bigrams survive.
+            if ((a == 0x20 && b == 0x20)
+                    || isSkipCodepoint(a) || isSkipCodepoint(b)) {
                 continue;
             }
             String ka = classKey(a);
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index 241b312a0f..056c768a65 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -73,6 +73,13 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
      * default read limit used by the charset base detectors. */
     private static final int DEFAULT_READ_LIMIT = 16384;
 
+    /** A STATISTICAL candidate at or below this confidence carries no real
+     *  signal — it's the "I don't know" level (matches Mojibuster's
+     *  windows-1252 fallback confidence).  When the statistical layer offers
+     *  nothing above this, the junk-filter defers to a DECLARATIVE/STRUCTURAL
+     *  anchor instead of arbitrating near-identical decodes by quality. */
+    private static final float NO_INFO_CONFIDENCE = 0.1f;
+
     /** Cached quality detector.  {@code null} if none is on the classpath. */
     private final TextQualityDetector qualityDetector;
 
@@ -206,15 +213,26 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         // address the same Masada-style whitespace-storm root cause for
         // every caller of JunkDetector and avoid the train/inference
         // distribution divergence that the strip introduced.
-        // The JunkDetector logit is already cross-script comparable (z1 is
-        // calibrated per script, z2..z9 are global), so arbitration is a plain
-        // argmax of the raw score — no per-script rescaling band-aid.
+        // The JunkDetector logit is cross-script comparable (z1 calibrated per
+        // script, z2..z9 global), so the base decision is a plain argmax of 
the
+        // raw score.  BUT the score is an ABSOLUTE per-decode quality, 
dominated
+        // by shared content (whitespace/digits identical across decodes); on a
+        // COMMON-dominated doc the discriminating bytes are diluted and the 
top
+        // candidates differ only by noise.  The quality signal is STATISTICAL-
+        // grade evidence, so it may override a higher-evidence anchor
+        // (DECLARATIVE author intent, or STRUCTURAL byte-grammar proof) only when
+        // some detector gave a confident non-declarative result (any STRUCTURAL
+        // result, or a STATISTICAL one above NO_INFO_CONFIDENCE); otherwise we
+        // defer to the anchor.  This is honest low-confidence behaviour, not a
+        // tie-break: with real signal (e.g. UTF-8 over garbage UTF-16, Δ≫1) it still overrides freely.
         Charset champion = null;
         double championZ = Double.NEGATIVE_INFINITY;
+        Map<Charset, Double> scoreByCharset = new LinkedHashMap<>();
         for (Map.Entry<Charset, String> entry : candidates.entrySet()) {
             org.apache.tika.quality.TextQualityScore sc =
                     qualityDetector.score(entry.getValue());
             float rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : 
sc.getZScore();
+            scoreByCharset.put(entry.getKey(), (double) rawZ);
             LOG.trace("junk-filter score {} z={} script={}",
                     entry.getKey().name(),
                     String.format(java.util.Locale.ROOT, "%.3f", rawZ),
@@ -229,6 +247,24 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
             // has no opinion, so keep the first (highest-confidence) 
candidate.
             champion = candidates.keySet().iterator().next();
         }
+
+        // "No-info" guard: if the statistical layer produced no confident
+        // answer — no STRUCTURAL proof, and its best STATISTICAL candidate is
+        // no better than Mojibuster's windows-1252 "I don't know" fallback
+        // (confidence <= NO_INFO_CONFIDENCE) — then it has nothing to say, so 
a
+        // DECLARATIVE/STRUCTURAL anchor (the author's declaration) should win
+        // rather than a quality argmax over near-identical decodes.  Fires 
ONLY
+        // when the statistical layer abstained, so it cannot cost the
+        // confident-detection wins (UTF-8 recovery etc.).
+        Charset anchor = bestAnchor(context, scoreByCharset);
+        if (anchor != null && !anchor.equals(champion)
+                && !hasConfidentNonDeclarative(context)) {
+            LOG.trace("junk-filter -> {} (defer to anchor; statistical layer 
gave "
+                            + "no confident answer, champion was {})",
+                    anchor.name(), champion.name());
+            context.setArbitrationInfo("junk-filter-defer-no-info");
+            return List.of(new EncodingResult(anchor, 
context.getTopConfidenceFor(anchor)));
+        }
         LOG.trace("junk-filter -> {} (argmax z={})",
                 champion.name(),
                 String.format(java.util.Locale.ROOT, "%.3f", championZ));
@@ -238,6 +274,56 @@ public class JunkFilterEncodingDetector implements 
MetaEncodingDetector {
         return List.of(new EncodingResult(champion, confidence));
     }
 
+    /**
+     * True if some detector produced a confident non-declarative signal: any
+     * STRUCTURAL result (byte-grammar proof), or any STATISTICAL result above
+     * {@link #NO_INFO_CONFIDENCE}.  When false, the statistical layer has
+     * effectively abstained (only its "I don't know" fallback), so a
+     * declaration should be trusted over a quality argmax.
+     */
+    private static boolean hasConfidentNonDeclarative(EncodingDetectorContext 
context) {
+        for (EncodingDetectorContext.Result r : context.getResults()) {
+            for (EncodingResult er : r.getEncodingResults()) {
+                EncodingResult.ResultType t = er.getResultType();
+                if (t == EncodingResult.ResultType.STRUCTURAL) {
+                    return true;
+                }
+                if (t == EncodingResult.ResultType.STATISTICAL
+                        && er.getConfidence() > NO_INFO_CONFIDENCE) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Highest-scoring DECLARATIVE/STRUCTURAL candidate present in the scored
+     * pool, or {@code null} if none.  This is the higher-evidence "anchor" the
+     * junk-filter defers to when the statistical layer gives no confident 
answer
+     * (see {@link #hasConfidentNonDeclarative}).
+     */
+    private static Charset bestAnchor(EncodingDetectorContext context,
+                                      Map<Charset, Double> scoreByCharset) {
+        Charset best = null;
+        double bestZ = Double.NEGATIVE_INFINITY;
+        for (EncodingDetectorContext.Result r : context.getResults()) {
+            for (EncodingResult er : r.getEncodingResults()) {
+                EncodingResult.ResultType t = er.getResultType();
+                if (t != EncodingResult.ResultType.DECLARATIVE
+                        && t != EncodingResult.ResultType.STRUCTURAL) {
+                    continue;
+                }
+                Double z = scoreByCharset.get(er.getCharset());
+                if (z != null && z > bestZ) {
+                    bestZ = z;
+                    best = er.getCharset();
+                }
+            }
+        }
+        return best;
+    }
+
     /**
      * Return the first DECLARATIVE charset whose decoded output equals at
      * least one other candidate's, or {@code null}.
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index 5a439399ce..c5a88fa093 100644
Binary files 
a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 and 
b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
 differ
diff --git 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
index 6d1f553414..76cf6ce02c 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java
@@ -83,6 +83,8 @@ public final class TraceJunkFilter {
         boolean sumDiff = false;
         boolean skipSymbols = false;
         int headBytes = 0;
+        boolean contentCleaner = false;
+        boolean showBuckets = false;
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
                 case "--sum-diff":
@@ -94,6 +96,12 @@ public final class TraceJunkFilter {
                 case "--head-bytes":
                     headBytes = Integer.parseInt(args[++i]);
                     break;
+                case "--content-cleaner":
+                    contentCleaner = true;
+                    break;
+                case "--buckets":
+                    showBuckets = true;
+                    break;
                 case "--file":
                     files.add(resolvePath(args[++i]));
                     break;
@@ -151,7 +159,8 @@ public final class TraceJunkFilter {
             traceOne(file, detector, moji, fixedCharsets, sampleLen,
                     showFeatures, showScriptDist, showPerScriptRun,
                     entityModes, autoCandidates, showMojibuster,
-                    sumDiff, skipSymbols, headBytes);
+                    sumDiff, skipSymbols, headBytes, contentCleaner,
+                    showBuckets);
         }
     }
 
@@ -172,7 +181,8 @@ public final class TraceJunkFilter {
                                   boolean entityModes, boolean autoCandidates,
                                   boolean showMojibuster,
                                   boolean sumDiff, boolean skipSymbols,
-                                  int headBytes)
+                                  int headBytes, boolean contentCleaner,
+                                  boolean showBuckets)
             throws IOException {
         byte[] all = Files.readAllBytes(file);
         int limit = headBytes > 0 ? Math.min(headBytes, all.length) : 
READ_LIMIT;
@@ -227,8 +237,13 @@ public final class TraceJunkFilter {
         Map<String, String> decoded = new LinkedHashMap<>();
         Map<String, TextQualityScore> scores = new LinkedHashMap<>();
         for (Charset cs : charsets) {
-            String s = JunkFilterEncodingDetector.expandHtmlEntities(
-                    new String(forDecode, cs));
+            // --content-cleaner replicates the live chain exactly (decode the
+            // BOM-stripped probe, then HtmlContentCleaner.clean); the default
+            // path is the byte-strip diagnostic, which scores DIFFERENT text.
+            String s = contentCleaner
+                    ? HtmlContentCleaner.clean(new String(bytes, cs))
+                    : JunkFilterEncodingDetector.expandHtmlEntities(
+                            new String(forDecode, cs));
             decoded.put(cs.name(), s);
             scores.put(cs.name(), detector.score(s));
         }
@@ -350,6 +365,11 @@ public final class TraceJunkFilter {
             }
         }
 
+        if (showBuckets) {
+            printBucketReport(detector, decoded);
+            printCommonBucketHistogram(detector, decoded);
+        }
+
         if (sampleLen > 0) {
             System.out.println("  decoded samples (first " + sampleLen + " 
chars):");
             for (String cs : decoded.keySet()) {
@@ -473,6 +493,144 @@ public final class TraceJunkFilter {
         }
     }
 
+    /** Per-bucket z1 breakdown — counts, raw means, calibration (mu,sigma),
+     *  calibrated bucket-z, and the (count * z) contribution to the doc-z1.
+     *  Probes the (B) "monoscript-collapse" hypothesis: which bucket(s) on
+     *  which decode drag the count-weighted aggregate? */
+    private static void printBucketReport(JunkDetector det,
+                                          Map<String, String> decoded) {
+        System.out.println("  per-bucket z1 breakdown:");
+        for (Map.Entry<String, String> ent : decoded.entrySet()) {
+            String cs = ent.getKey();
+            String text = java.text.Normalizer.normalize(
+                    ent.getValue(), java.text.Normalizer.Form.NFC);
+            int[] cps = text.codePoints().toArray();
+            Map<String, double[]> buckets =
+                    JunkDetector.bucketSumsAndCounts(
+                            cps, det.f1TablesByScriptView());
+            // Aggregate doc z1 ourselves to verify against 
scoreWithFeatureComponents.
+            double weightedSum = 0;
+            long totalCount = 0;
+            System.out.println("    " + cs + ":");
+            System.out.printf(Locale.ROOT,
+                    "      %-14s %7s %9s %9s %9s %9s %9s%n",
+                    "bucket", "count", "rawMean", "mu", "sigma", "z", "ct*z");
+            // Sort by count descending for readability
+            List<Map.Entry<String, double[]>> rows =
+                    new ArrayList<>(buckets.entrySet());
+            rows.sort((a, b) -> Double.compare(b.getValue()[1], 
a.getValue()[1]));
+            for (Map.Entry<String, double[]> e : rows) {
+                String script = e.getKey();
+                long cnt = (long) e.getValue()[1];
+                if (cnt == 0) continue;
+                double rawMean = e.getValue()[0] / cnt;
+                float[] cal = det.calibrationFor(script);
+                double mu = cal == null ? Double.NaN : cal[0];
+                double sigma = cal == null ? Double.NaN : cal[1];
+                double z = cal == null ? Double.NaN : (rawMean - mu) / sigma;
+                double ctz = z * cnt;
+                if (!Double.isNaN(z)) {
+                    weightedSum += ctz;
+                    totalCount += cnt;
+                }
+                System.out.printf(Locale.ROOT,
+                        "      %-14s %7d %+9.3f %+9.3f %9.3f %+9.3f %+9.1f%n",
+                        script, cnt, rawMean, mu, sigma, z, ctz);
+            }
+            if (totalCount > 0) {
+                double docZ1 = weightedSum / totalCount;
+                System.out.printf(Locale.ROOT,
+                        "      %-14s %7d %9s %9s %9s %+9.3f  (sum-ctz/N = doc 
z1)%n",
+                        "TOTAL", totalCount, "", "", "", docZ1);
+            }
+        }
+    }
+
+    /** Histogram of bigrams that land in the COMMON bucket, broken down by
+     *  the Unicode general-category-letter pair (e.g. "Zs·Zs" = space·space).
+     *  Probes whether COMMON is dominated by whitespace HTML residue. */
+    private static void printCommonBucketHistogram(JunkDetector det,
+                                                   Map<String, String> 
decoded) {
+        System.out.println("  COMMON bucket bigram categories (top 12 by 
count):");
+        for (Map.Entry<String, String> ent : decoded.entrySet()) {
+            String cs = ent.getKey();
+            String text = java.text.Normalizer.normalize(
+                    ent.getValue(), java.text.Normalizer.Form.NFC);
+            int[] cps = text.codePoints().toArray();
+            BigramTables tCommon = det.f1TablesFor(JunkDetector.COMMON_SCRIPT);
+            Map<String, long[]> hist = new java.util.HashMap<>(); // catPair 
-> {count, sumLogP}
+            long commonBigrams = 0;
+            double commonSum = 0;
+            for (int i = 0; i + 1 < cps.length; i++) {
+                int a = cps[i], b = cps[i + 1];
+                if (Character.isDigit(a) || Character.isDigit(b)) continue;
+                String ka = JunkDetector.classKey(a);
+                String kb = JunkDetector.classKey(b);
+                if (!JunkDetector.COMMON_SCRIPT.equals(ka)
+                        || !JunkDetector.COMMON_SCRIPT.equals(kb)) {
+                    continue;
+                }
+                String pair = catLabel(a) + "·" + catLabel(b);
+                long[] row = hist.computeIfAbsent(pair, k -> new long[2]);
+                row[0]++;
+                if (tCommon != null) {
+                    double lp = JunkDetector.computeF1MeanLogP(
+                            new int[]{a, b}, tCommon);
+                    if (!Double.isNaN(lp)) {
+                        commonSum += lp;
+                        commonBigrams++;
+                        // Encode sum-lp ×1000 in row[1] for an int aggregate.
+                        row[1] += (long) (lp * 1000);
+                    }
+                }
+            }
+            System.out.println("    " + cs + ":  total=" + commonBigrams
+                    + "  rawMean=" + String.format(Locale.ROOT, "%.3f",
+                            commonBigrams == 0 ? 0 : commonSum / 
commonBigrams));
+            List<Map.Entry<String, long[]>> rows =
+                    new ArrayList<>(hist.entrySet());
+            rows.sort((x, y) -> Long.compare(y.getValue()[0], 
x.getValue()[0]));
+            int shown = 0;
+            for (Map.Entry<String, long[]> e : rows) {
+                if (shown++ >= 12) break;
+                long cnt = e.getValue()[0];
+                double meanLp = cnt == 0 ? 0
+                        : (e.getValue()[1] / 1000.0) / cnt;
+                double pct = 100.0 * cnt / Math.max(1, commonBigrams);
+                System.out.printf(Locale.ROOT,
+                        "      %-12s %7d (%5.1f%%)   meanLp=%+8.3f%n",
+                        e.getKey(), cnt, pct, meanLp);
+            }
+        }
+    }
+
+    /** Short category label for diagnostic output: Zs=space-sep, Zl=line-sep,
+     *  Po=other-punct, Pd=dash-punct, Ps=open, Pe=close, Sm=math, Sc=currency,
+     *  Sk=modifier-sym, So=other-sym, Cc=control, Cf=format, etc. */
+    private static String catLabel(int cp) {
+        int t = Character.getType(cp);
+        switch (t) {
+            case Character.SPACE_SEPARATOR:       return "Zs";
+            case Character.LINE_SEPARATOR:        return "Zl";
+            case Character.PARAGRAPH_SEPARATOR:   return "Zp";
+            case Character.CONTROL:               return "Cc";
+            case Character.FORMAT:                return "Cf";
+            case Character.CONNECTOR_PUNCTUATION: return "Pc";
+            case Character.DASH_PUNCTUATION:      return "Pd";
+            case Character.START_PUNCTUATION:     return "Ps";
+            case Character.END_PUNCTUATION:       return "Pe";
+            case Character.INITIAL_QUOTE_PUNCTUATION: return "Pi";
+            case Character.FINAL_QUOTE_PUNCTUATION:   return "Pf";
+            case Character.OTHER_PUNCTUATION:     return "Po";
+            case Character.MATH_SYMBOL:           return "Sm";
+            case Character.CURRENCY_SYMBOL:       return "Sc";
+            case Character.MODIFIER_SYMBOL:       return "Sk";
+            case Character.OTHER_SYMBOL:          return "So";
+            default:
+                return String.format(Locale.ROOT, "?%d", t);
+        }
+    }
+
     private static void printPerScriptRun(JunkDetector det, String text) {
         Map<String, StringBuilder> byScript = new TreeMap<>();
         int i = 0;

(tika) 08/09: TIKA-4731 - whitespace cleanup

Reply via email to