This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit ce713666d6dbc317e7af1b631b2008c533c8b694 Author: tallison <[email protected]> AuthorDate: Mon May 25 09:27:34 2026 -0400 TIKA-4731 - whitespace cleanup --- .../apache/tika/ml/junkdetect/JunkDetector.java | 47 +++++- .../ml/junkdetect/JunkFilterEncodingDetector.java | 92 +++++++++++- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2324972 -> 2317582 bytes .../apache/tika/ml/junkdetect/TraceJunkFilter.java | 166 ++++++++++++++++++++- 4 files changed, 295 insertions(+), 10 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index 8eb8dab8a7..c8c1c90cac 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -1084,6 +1084,16 @@ public final class JunkDetector implements TextQualityDetector { return f1TablesByScript.get(script); } + /** Per-script z1 calibration {mu, sigma} (package-private, for diagnostics). */ + float[] calibrationFor(String script) { + return calibrations.get(script); + } + + /** Per-script F1 bigram tables view (package-private, for diagnostics). */ + Map<String, BigramTables> f1TablesByScriptView() { + return f1TablesByScript; + } + // ----------------------------------------------------------------------- // Bucket-by-script bigram enumeration (the keystone). // Single source of truth for BOTH inference z1 scoring and training tally. @@ -1113,6 +1123,31 @@ public final class JunkDetector implements TextQualityDetector { return Character.isDigit(cp); } + /** Whitespace-equivalents collapse to canonical U+0020 (' ') before bigram + * lookup. Tab, NBSP, ideographic space, line/paragraph separators, and + * every Cc control char (\\n, \\r, etc.; C1 controls U+0080-U+009F included) 
all signal the SAME thing about + * surrounding script context — "end of word" — and shouldn't produce + * distinct bigram entries. Applied identically in training and inference + * via {@link #forEachScriptBigram}, so the bigram table only ever sees + * the canonical form. The follow-on (space, space) drop in + * {@link #forEachScriptBigram} then ensures collapsed whitespace runs + * (HtmlContentCleaner indentation residue: \\t\\t, \\n\\r, NBSP·\\t) + * contribute nothing to the COMMON bucket — only (script, space) and + * (space, script) boundary bigrams survive. */ + static int normalizeWhitespace(int cp) { + if (cp == 0x20) { + return cp; + } + int t = Character.getType(cp); + if (t == Character.CONTROL + || t == Character.SPACE_SEPARATOR + || t == Character.LINE_SEPARATOR + || t == Character.PARAGRAPH_SEPARATOR) { + return 0x20; + } + return cp; + } + /** * Enumerates the script-bucketed bigrams of {@code cps} under the redesign * representation (see block comment above). Used identically by inference @@ -1149,9 +1184,15 @@ public final class JunkDetector implements TextQualityDetector { return; } for (int i = 0; i + 1 < cps.length; i++) { - int a = cps[i]; - int b = cps[i + 1]; - if (isSkipCodepoint(a) || isSkipCodepoint(b)) { + int a = normalizeWhitespace(cps[i]); + int b = normalizeWhitespace(cps[i + 1]); + // (space, space) drop short-circuits ahead of the digit check: + // after normalization, every whitespace run (\t\t, \n\r, NBSP·\t, + // HtmlContentCleaner indentation salads) collapses to this one + // bigram class — high count on HTML, zero script-boundary signal. + // (script, space) / (space, script) boundary bigrams survive. 
+ if ((a == 0x20 && b == 0x20) + || isSkipCodepoint(a) || isSkipCodepoint(b)) { continue; } String ka = classKey(a); diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index 241b312a0f..056c768a65 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -73,6 +73,13 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { * default read limit used by the charset base detectors. */ private static final int DEFAULT_READ_LIMIT = 16384; + /** A STATISTICAL candidate at or below this confidence carries no real + * signal — it's the "I don't know" level (matches Mojibuster's + * windows-1252 fallback confidence). When the statistical layer offers + * nothing above this, the junk-filter defers to a DECLARATIVE/STRUCTURAL + * anchor instead of arbitrating near-identical decodes by quality. */ + private static final float NO_INFO_CONFIDENCE = 0.1f; + /** Cached quality detector. {@code null} if none is on the classpath. */ private final TextQualityDetector qualityDetector; @@ -206,15 +213,26 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { // address the same Masada-style whitespace-storm root cause for // every caller of JunkDetector and avoid the train/inference // distribution divergence that the strip introduced. - // The JunkDetector logit is already cross-script comparable (z1 is - // calibrated per script, z2..z9 are global), so arbitration is a plain - // argmax of the raw score — no per-script rescaling band-aid. + // The JunkDetector logit is cross-script comparable (z1 calibrated per + // script, z2..z9 global), so the base decision is a plain argmax of the + // raw score. 
BUT the score is an ABSOLUTE per-decode quality, dominated + // by shared content (whitespace/digits identical across decodes); on a + // COMMON-dominated doc the discriminating bytes are diluted and the top + // candidates differ only by noise. The quality signal is STATISTICAL- + // grade evidence, so it yields to a higher-evidence anchor (DECLARATIVE + // author intent, or STRUCTURAL byte-grammar proof) whenever the + // statistical layer has abstained (the "no-info" guard below); otherwise + // the argmax stands. This is honest low-confidence behaviour, not a tie-break: + // where the model has real signal (e.g. UTF-8 over garbage UTF-16, Δ≫1) + // it still overrides freely. Charset champion = null; double championZ = Double.NEGATIVE_INFINITY; + Map<Charset, Double> scoreByCharset = new LinkedHashMap<>(); for (Map.Entry<Charset, String> entry : candidates.entrySet()) { org.apache.tika.quality.TextQualityScore sc = qualityDetector.score(entry.getValue()); float rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : sc.getZScore(); + scoreByCharset.put(entry.getKey(), (double) rawZ); LOG.trace("junk-filter score {} z={} script={}", entry.getKey().name(), String.format(java.util.Locale.ROOT, "%.3f", rawZ), @@ -229,6 +247,24 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { // has no opinion, so keep the first (highest-confidence) candidate. champion = candidates.keySet().iterator().next(); } + + // "No-info" guard: if the statistical layer produced no confident + // answer — no STRUCTURAL proof, and its best STATISTICAL candidate is + // no better than Mojibuster's windows-1252 "I don't know" fallback + // (confidence <= NO_INFO_CONFIDENCE) — then it has nothing to say, so a + // DECLARATIVE/STRUCTURAL anchor (the author's declaration) should win + // rather than a quality argmax over near-identical decodes. Fires ONLY + // when the statistical layer abstained, so it cannot cost the + // confident-detection wins (UTF-8 recovery etc.). 
+ Charset anchor = bestAnchor(context, scoreByCharset); + if (anchor != null && !anchor.equals(champion) + && !hasConfidentNonDeclarative(context)) { + LOG.trace("junk-filter -> {} (defer to anchor; statistical layer gave " + + "no confident answer, champion was {})", + anchor.name(), champion.name()); + context.setArbitrationInfo("junk-filter-defer-no-info"); + return List.of(new EncodingResult(anchor, context.getTopConfidenceFor(anchor))); + } LOG.trace("junk-filter -> {} (argmax z={})", champion.name(), String.format(java.util.Locale.ROOT, "%.3f", championZ)); @@ -238,6 +274,56 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { return List.of(new EncodingResult(champion, confidence)); } + /** + * True if some detector produced a confident non-declarative signal: any + * STRUCTURAL result (byte-grammar proof), or any STATISTICAL result above + * {@link #NO_INFO_CONFIDENCE}. When false, the statistical layer has + * effectively abstained (only its "I don't know" fallback), so a + * declaration should be trusted over a quality argmax. + */ + private static boolean hasConfidentNonDeclarative(EncodingDetectorContext context) { + for (EncodingDetectorContext.Result r : context.getResults()) { + for (EncodingResult er : r.getEncodingResults()) { + EncodingResult.ResultType t = er.getResultType(); + if (t == EncodingResult.ResultType.STRUCTURAL) { + return true; + } + if (t == EncodingResult.ResultType.STATISTICAL + && er.getConfidence() > NO_INFO_CONFIDENCE) { + return true; + } + } + } + return false; + } + + /** + * Highest-scoring DECLARATIVE/STRUCTURAL candidate present in the scored + * pool, or {@code null} if none. This is the higher-evidence "anchor" the + * junk-filter defers to when the statistical layer gives no confident answer + * (see {@link #hasConfidentNonDeclarative}). 
+ */ + private static Charset bestAnchor(EncodingDetectorContext context, + Map<Charset, Double> scoreByCharset) { + Charset best = null; + double bestZ = Double.NEGATIVE_INFINITY; + for (EncodingDetectorContext.Result r : context.getResults()) { + for (EncodingResult er : r.getEncodingResults()) { + EncodingResult.ResultType t = er.getResultType(); + if (t != EncodingResult.ResultType.DECLARATIVE + && t != EncodingResult.ResultType.STRUCTURAL) { + continue; + } + Double z = scoreByCharset.get(er.getCharset()); + if (z != null && z > bestZ) { + bestZ = z; + best = er.getCharset(); + } + } + } + return best; + } + /** * Return the first DECLARATIVE charset whose decoded output equals at * least one other candidate's, or {@code null}. diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index 5a439399ce..c5a88fa093 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java index 6d1f553414..76cf6ce02c 100644 --- a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/TraceJunkFilter.java @@ -83,6 +83,8 @@ public final class TraceJunkFilter { boolean sumDiff = false; boolean skipSymbols = false; int headBytes = 0; + boolean contentCleaner = false; + boolean showBuckets = false; for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--sum-diff": @@ -94,6 +96,12 @@ public final class TraceJunkFilter { case "--head-bytes": headBytes = 
Integer.parseInt(args[++i]); break; + case "--content-cleaner": + contentCleaner = true; + break; + case "--buckets": + showBuckets = true; + break; case "--file": files.add(resolvePath(args[++i])); break; @@ -151,7 +159,8 @@ public final class TraceJunkFilter { traceOne(file, detector, moji, fixedCharsets, sampleLen, showFeatures, showScriptDist, showPerScriptRun, entityModes, autoCandidates, showMojibuster, - sumDiff, skipSymbols, headBytes); + sumDiff, skipSymbols, headBytes, contentCleaner, + showBuckets); } } @@ -172,7 +181,8 @@ public final class TraceJunkFilter { boolean entityModes, boolean autoCandidates, boolean showMojibuster, boolean sumDiff, boolean skipSymbols, - int headBytes) + int headBytes, boolean contentCleaner, + boolean showBuckets) throws IOException { byte[] all = Files.readAllBytes(file); int limit = headBytes > 0 ? Math.min(headBytes, all.length) : READ_LIMIT; @@ -227,8 +237,13 @@ public final class TraceJunkFilter { Map<String, String> decoded = new LinkedHashMap<>(); Map<String, TextQualityScore> scores = new LinkedHashMap<>(); for (Charset cs : charsets) { - String s = JunkFilterEncodingDetector.expandHtmlEntities( - new String(forDecode, cs)); + // --content-cleaner replicates the live chain exactly (decode the + // BOM-stripped probe, then HtmlContentCleaner.clean); the default + // path is the byte-strip diagnostic, which scores DIFFERENT text. + String s = contentCleaner + ? 
HtmlContentCleaner.clean(new String(bytes, cs)) + : JunkFilterEncodingDetector.expandHtmlEntities( + new String(forDecode, cs)); decoded.put(cs.name(), s); scores.put(cs.name(), detector.score(s)); } @@ -350,6 +365,11 @@ public final class TraceJunkFilter { } } + if (showBuckets) { + printBucketReport(detector, decoded); + printCommonBucketHistogram(detector, decoded); + } + if (sampleLen > 0) { System.out.println(" decoded samples (first " + sampleLen + " chars):"); for (String cs : decoded.keySet()) { @@ -473,6 +493,144 @@ public final class TraceJunkFilter { } } + /** Per-bucket z1 breakdown — counts, raw means, calibration (mu,sigma), + * calibrated bucket-z, and the (count * z) contribution to the doc-z1. + * Probes the (B) "monoscript-collapse" hypothesis: which bucket(s) on + * which decode drag the count-weighted aggregate? */ + private static void printBucketReport(JunkDetector det, + Map<String, String> decoded) { + System.out.println(" per-bucket z1 breakdown:"); + for (Map.Entry<String, String> ent : decoded.entrySet()) { + String cs = ent.getKey(); + String text = java.text.Normalizer.normalize( + ent.getValue(), java.text.Normalizer.Form.NFC); + int[] cps = text.codePoints().toArray(); + Map<String, double[]> buckets = + JunkDetector.bucketSumsAndCounts( + cps, det.f1TablesByScriptView()); + // Aggregate doc z1 ourselves to verify against scoreWithFeatureComponents. 
+ double weightedSum = 0; + long totalCount = 0; + System.out.println(" " + cs + ":"); + System.out.printf(Locale.ROOT, + " %-14s %7s %9s %9s %9s %9s %9s%n", + "bucket", "count", "rawMean", "mu", "sigma", "z", "ct*z"); + // Sort by count descending for readability + List<Map.Entry<String, double[]>> rows = + new ArrayList<>(buckets.entrySet()); + rows.sort((a, b) -> Double.compare(b.getValue()[1], a.getValue()[1])); + for (Map.Entry<String, double[]> e : rows) { + String script = e.getKey(); + long cnt = (long) e.getValue()[1]; + if (cnt == 0) continue; + double rawMean = e.getValue()[0] / cnt; + float[] cal = det.calibrationFor(script); + double mu = cal == null ? Double.NaN : cal[0]; + double sigma = cal == null ? Double.NaN : cal[1]; + double z = cal == null ? Double.NaN : (rawMean - mu) / sigma; + double ctz = z * cnt; + if (!Double.isNaN(z)) { + weightedSum += ctz; + totalCount += cnt; + } + System.out.printf(Locale.ROOT, + " %-14s %7d %+9.3f %+9.3f %9.3f %+9.3f %+9.1f%n", + script, cnt, rawMean, mu, sigma, z, ctz); + } + if (totalCount > 0) { + double docZ1 = weightedSum / totalCount; + System.out.printf(Locale.ROOT, + " %-14s %7d %9s %9s %9s %+9.3f (sum-ctz/N = doc z1)%n", + "TOTAL", totalCount, "", "", "", docZ1); + } + } + } + + /** Histogram of bigrams that land in the COMMON bucket, broken down by + * the Unicode general-category-letter pair (e.g. "Zs·Zs" = space·space). + * Probes whether COMMON is dominated by whitespace HTML residue. 
*/ + private static void printCommonBucketHistogram(JunkDetector det, + Map<String, String> decoded) { + System.out.println(" COMMON bucket bigram categories (top 12 by count):"); + for (Map.Entry<String, String> ent : decoded.entrySet()) { + String cs = ent.getKey(); + String text = java.text.Normalizer.normalize( + ent.getValue(), java.text.Normalizer.Form.NFC); + int[] cps = text.codePoints().toArray(); + BigramTables tCommon = det.f1TablesFor(JunkDetector.COMMON_SCRIPT); + Map<String, long[]> hist = new java.util.HashMap<>(); // catPair -> {count, sumLogP} + long commonBigrams = 0; + double commonSum = 0; + for (int i = 0; i + 1 < cps.length; i++) { + int a = cps[i], b = cps[i + 1]; + if (Character.isDigit(a) || Character.isDigit(b)) continue; + String ka = JunkDetector.classKey(a); + String kb = JunkDetector.classKey(b); + if (!JunkDetector.COMMON_SCRIPT.equals(ka) + || !JunkDetector.COMMON_SCRIPT.equals(kb)) { + continue; + } + String pair = catLabel(a) + "·" + catLabel(b); + long[] row = hist.computeIfAbsent(pair, k -> new long[2]); + row[0]++; + if (tCommon != null) { + double lp = JunkDetector.computeF1MeanLogP( + new int[]{a, b}, tCommon); + if (!Double.isNaN(lp)) { + commonSum += lp; + commonBigrams++; + // Encode sum-lp ×1000 in row[1] for an int aggregate. + row[1] += (long) (lp * 1000); + } + } + } + System.out.println(" " + cs + ": total=" + commonBigrams + + " rawMean=" + String.format(Locale.ROOT, "%.3f", + commonBigrams == 0 ? 0 : commonSum / commonBigrams)); + List<Map.Entry<String, long[]>> rows = + new ArrayList<>(hist.entrySet()); + rows.sort((x, y) -> Long.compare(y.getValue()[0], x.getValue()[0])); + int shown = 0; + for (Map.Entry<String, long[]> e : rows) { + if (shown++ >= 12) break; + long cnt = e.getValue()[0]; + double meanLp = cnt == 0 ? 
0 + : (e.getValue()[1] / 1000.0) / cnt; + double pct = 100.0 * cnt / Math.max(1, commonBigrams); + System.out.printf(Locale.ROOT, + " %-12s %7d (%5.1f%%) meanLp=%+8.3f%n", + e.getKey(), cnt, pct, meanLp); + } + } + } + + /** Short category label for diagnostic output: Zs=space-sep, Zl=line-sep, + * Po=other-punct, Pd=dash-punct, Ps=open, Pe=close, Sm=math, Sc=currency, + * Sk=modifier-sym, So=other-sym, Cc=control, Cf=format, etc. */ + private static String catLabel(int cp) { + int t = Character.getType(cp); + switch (t) { + case Character.SPACE_SEPARATOR: return "Zs"; + case Character.LINE_SEPARATOR: return "Zl"; + case Character.PARAGRAPH_SEPARATOR: return "Zp"; + case Character.CONTROL: return "Cc"; + case Character.FORMAT: return "Cf"; + case Character.CONNECTOR_PUNCTUATION: return "Pc"; + case Character.DASH_PUNCTUATION: return "Pd"; + case Character.START_PUNCTUATION: return "Ps"; + case Character.END_PUNCTUATION: return "Pe"; + case Character.INITIAL_QUOTE_PUNCTUATION: return "Pi"; + case Character.FINAL_QUOTE_PUNCTUATION: return "Pf"; + case Character.OTHER_PUNCTUATION: return "Po"; + case Character.MATH_SYMBOL: return "Sm"; + case Character.CURRENCY_SYMBOL: return "Sc"; + case Character.MODIFIER_SYMBOL: return "Sk"; + case Character.OTHER_SYMBOL: return "So"; + default: + return String.format(Locale.ROOT, "?%d", t); + } + } + private static void printPerScriptRun(JunkDetector det, String text) { Map<String, StringBuilder> byScript = new TreeMap<>(); int i = 0;
