(tika) 02/06: junk-detector: corpus diagnostic tools for v7 sizing

tallison Thu, 14 May 2026 11:46:22 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch junk-detector-v6
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 49eb7b48840c95200af8f21ddfe614048a5719f7
Author: tballison <[email protected]>
AuthorDate: Thu May 14 12:02:22 2026 -0400

    junk-detector: corpus diagnostic tools for v7 sizing
    
    Five read-only tools that report training-corpus statistics used to
    inform per-script F1 sizing decisions.  None of these are wired into
    the main trainer or model output; they're invoked manually.
    
    * CountPerScriptBigrams - distinct (cpA,cpB) pair counts per script,
      with coverage curves and per-cutoff model-size estimates for several
      candidate storage schemes (MPHF+val, MPHF+fp+val, open-addressing).
    
    * AnalyzeHanByBlock - bucket HAN bigrams by the Unicode block of each
      codepoint, with ASCII split into digit/letter/punct.  Surfaces the
      CJK Unified / Hiragana / Katakana / ASCII composition of the HAN
      pool.
    
    * ScriptCensus - per-line dominant-script histogram for one or more
      text files (gz or plain).  Used to verify how BuildJunkTrainingData
      routes mixed-script languages like Japanese.
    
    * LineScriptFractions - for each *.train.gz, histogram of the per-line
      target-script-fraction, with cumulative drop percentages at
      thresholds 10/20/30/50/70/90/100.  Identifies scripts whose corpora
      are mostly off-target (e.g. GOTHIC: 40% of lines are <5% Gothic).
    
    * BoundaryBigramAudit - classify every bigram in *.train.gz as
      in-script / script-boundary / foreign-interior / pure-Latin-letter-
      run, and report distinct-pair drop counts under two candidate filter
      rules.
    
    All five build under existing checkstyle; no test fixtures added.
    
    Co-authored-by: Cursor <[email protected]>
---
 .../ml/junkdetect/tools/AnalyzeHanByBlock.java     | 201 +++++++++++++
 .../ml/junkdetect/tools/BoundaryBigramAudit.java   | 170 +++++++++++
 .../ml/junkdetect/tools/CountPerScriptBigrams.java | 326 +++++++++++++++++++++
 .../ml/junkdetect/tools/LineScriptFractions.java   | 155 ++++++++++
 .../tika/ml/junkdetect/tools/ScriptCensus.java     | 165 +++++++++++
 5 files changed, 1017 insertions(+)

diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
new file mode 100644
index 0000000000..08b2aa4eb5
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Diagnostic tool: bucket every bigram in {@code han.train.gz} (or any
+ * specified file) by the {@link Character.UnicodeBlock} of each codepoint,
+ * and report the distribution.
+ *
+ * <p>Goal: determine whether HAN's 224K distinct pairs split cleanly along
+ * block boundaries — e.g. CJK Unified Ideographs vs. Hiragana vs. Katakana —
+ * which would justify routing HAN windows to language-specific sub-models in
+ * the v7 design.
+ *
+ * <p>Usage:
+ * <pre>
+ *   java ... AnalyzeHanByBlock /path/to/junkdetect/han.train.gz
+ * </pre>
+ */
+public final class AnalyzeHanByBlock {
+
+    private AnalyzeHanByBlock() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: AnalyzeHanByBlock <train.gz>");
+            System.exit(1);
+        }
+        Path file = Paths.get(args[0]);
+
+        // (blockA, blockB) -> [totalBigrams, distinctSet via HashMap<Long, 
[count]>]
+        // We use Maps of Maps to keep code simple; HAN is the only file
+        // big enough to matter and fits in heap.
+        Map<String, Map<Long, long[]>> byBlockPair = new HashMap<>();
+        Map<String, long[]> blockPairTotals = new HashMap<>();
+        long totalN = 0;
+
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(file)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                int prevCp = -1;
+                String prevBlock = null;
+                for (int i = 0; i < line.length(); ) {
+                    int cp = line.codePointAt(i);
+                    i += Character.charCount(cp);
+                    String block = blockShortName(cp);
+                    if (prevCp >= 0) {
+                        String key = prevBlock + "|" + block;
+                        Map<Long, long[]> set = byBlockPair.computeIfAbsent(
+                                key, k -> new HashMap<>(256));
+                        long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+                        long[] c = set.get(packed);
+                        if (c == null) {
+                            set.put(packed, new long[]{1L});
+                        } else {
+                            c[0]++;
+                        }
+                        blockPairTotals.computeIfAbsent(key, k -> new 
long[1])[0]++;
+                        totalN++;
+                    }
+                    prevCp = cp;
+                    prevBlock = block;
+                }
+            }
+        }
+
+        System.out.printf("File: %s%n", file);
+        System.out.printf("Total bigram occurrences: %,d%n%n", totalN);
+
+        // Sort block-pair keys by total occurrences (descending).
+        List<Map.Entry<String, long[]>> sorted = new 
ArrayList<>(blockPairTotals.entrySet());
+        sorted.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> -e.getValue()[0]));
+
+        System.out.printf("%-50s %14s %14s %12s %8s%n",
+                "block_pair", "occurrences", "distinct", "singletons", 
"%total");
+        System.out.println(repeat('-', 105));
+
+        long distinctTotal = 0;
+        long singletonsTotal = 0;
+        for (Map.Entry<String, long[]> e : sorted) {
+            String pair = e.getKey();
+            long n = e.getValue()[0];
+            Map<Long, long[]> set = byBlockPair.get(pair);
+            int distinct = set.size();
+            int singletons = 0;
+            for (long[] c : set.values()) {
+                if (c[0] == 1) singletons++;
+            }
+            distinctTotal += distinct;
+            singletonsTotal += singletons;
+            double pct = 100.0 * n / totalN;
+            if (pct < 0.1 && n < 1000) {
+                continue; // skip tail noise rows
+            }
+            System.out.printf("%-50s %,14d %,14d %,12d %7.2f%%%n",
+                    pair, n, distinct, singletons, pct);
+        }
+        System.out.println(repeat('-', 105));
+        System.out.printf("Total distinct pairs (incl. tail): %,d%n", 
distinctTotal);
+        System.out.printf("Total singletons (incl. tail):     %,d%n", 
singletonsTotal);
+
+        // Roll up by individual block (left side only) to see per-block 
distinct counts.
+        System.out.println();
+        System.out.println("=== Per-leading-block roll-up ===");
+        Map<String, Long> distinctByLeadingBlock = new HashMap<>();
+        Map<String, Long> occByLeadingBlock = new HashMap<>();
+        for (Map.Entry<String, Map<Long, long[]>> e : byBlockPair.entrySet()) {
+            String leading = e.getKey().substring(0, e.getKey().indexOf('|'));
+            distinctByLeadingBlock.merge(leading, (long) e.getValue().size(), 
Long::sum);
+            long sum = 0;
+            for (long[] c : e.getValue().values()) sum += c[0];
+            occByLeadingBlock.merge(leading, sum, Long::sum);
+        }
+        List<Map.Entry<String, Long>> rollup = new 
ArrayList<>(occByLeadingBlock.entrySet());
+        rollup.sort(Comparator.comparingLong(
+                (Map.Entry<String, Long> e) -> -e.getValue()));
+        System.out.printf("%-35s %14s %14s%n",
+                "leading_block", "occurrences", "distinct(rough)");
+        System.out.println(repeat('-', 70));
+        for (Map.Entry<String, Long> e : rollup) {
+            System.out.printf("%-35s %,14d %,14d%n",
+                    e.getKey(), e.getValue(),
+                    distinctByLeadingBlock.get(e.getKey()));
+        }
+    }
+
+    /**
+     * Short-name for the Unicode block containing {@code cp}.  Compresses the
+     * many CJK-related blocks into a handful of human-readable labels.
+     *
+     * <p>Splits ASCII into ASCII_DIGIT / ASCII_LETTER / ASCII_PUNCT so we can
+     * distinguish numerals (which are content-bearing across all scripts) from
+     * English-letter contamination and punctuation.
+     */
+    private static String blockShortName(int cp) {
+        Character.UnicodeBlock b = Character.UnicodeBlock.of(cp);
+        if (b == null) return "UNK";
+
+        String name = b.toString();
+        if (name.equals("BASIC_LATIN")) {
+            if (cp >= '0' && cp <= '9') return "ASCII_DIGIT";
+            if ((cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')) return 
"ASCII_LETTER";
+            return "ASCII_PUNCT";
+        }
+        // Compress noisy block names for the report.
+        if (name.startsWith("CJK_UNIFIED_IDEOGRAPHS_EXTENSION")) {
+            return "CJK_EXT_" + name.substring(name.lastIndexOf('_') + 1);
+        }
+        if (name.equals("CJK_UNIFIED_IDEOGRAPHS")) return "CJK_UNIFIED";
+        if (name.equals("CJK_SYMBOLS_AND_PUNCTUATION")) return "CJK_PUNCT";
+        if (name.equals("CJK_COMPATIBILITY_IDEOGRAPHS")) return "CJK_COMPAT";
+        if (name.equals("CJK_COMPATIBILITY_FORMS")) return "CJK_COMPAT_FORMS";
+        if (name.equals("HALFWIDTH_AND_FULLWIDTH_FORMS")) return "HALF_FULL";
+        if (name.equals("HIRAGANA")) return "HIRAGANA";
+        if (name.equals("KATAKANA")) return "KATAKANA";
+        if (name.equals("KATAKANA_PHONETIC_EXTENSIONS")) return "KATAKANA_EXT";
+        if (name.equals("HANGUL_SYLLABLES")) return "HANGUL";
+        if (name.equals("HANGUL_JAMO")) return "HANGUL_JAMO";
+        if (name.equals("HANGUL_COMPATIBILITY_JAMO")) return "HANGUL_JAMO_C";
+        if (name.equals("LATIN_1_SUPPLEMENT")) return "LATIN1";
+        return name;
+    }
+
+    private static String repeat(char c, int n) {
+        char[] buf = new char[n];
+        java.util.Arrays.fill(buf, c);
+        return new String(buf);
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
new file mode 100644
index 0000000000..f64986b8dd
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BoundaryBigramAudit.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * For each {@code *.train.gz} file in a directory, classify every adjacent
+ * codepoint pair by its relation to the target script S, where S is the
+ * {@link Character.UnicodeScript} constant named by the file's basename
+ * (uppercased).  Categories:
+ *
+ * <ul>
+ *   <li>IN_S_INTERIOR — both codepoints are in S or in COMMON/INHERITED
+ *   <li>S_BOUNDARY    — exactly one codepoint is in S-or-COMMON/INHERITED,
+ *       the other is not
+ *   <li>FOREIGN_INTERIOR — neither codepoint is in S or COMMON/INHERITED
+ *       (the two may belong to different scripts).  Under the proposed
+ *       generalized boundary rule, these are the bigrams to drop from S's
+ *       training.
+ *   <li>LATIN_LETTER_RUN — subcategory of FOREIGN_INTERIOR where both
+ *       codepoints are Latin letters A–Z/a–z, in either their ASCII or
+ *       fullwidth form; this is the English-run case.  Reported in the
+ *       {@code ascii_run_occ} column.
+ * </ul>
+ *
+ * <p>Prints one row per file: occurrence counts for each category, the total,
+ * and the number of distinct pairs that would disappear when dropping every
+ * FOREIGN_INTERIOR bigram ({@code drop_foreign_dist}) or only the
+ * Latin-letter runs ({@code drop_asciirun_dist}).
+ *
+ * <p>Files whose basename does not name a {@code UnicodeScript} constant are
+ * skipped with a warning on stderr.
+ *
+ * <p>Usage:
+ * <pre>
+ *   java ... BoundaryBigramAudit /path/to/junkdetect
+ * </pre>
+ */
+public final class BoundaryBigramAudit {
+
+    private static final String TRAIN_SUFFIX = ".train.gz";
+
+    private BoundaryBigramAudit() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: BoundaryBigramAudit <dataDir>");
+            System.exit(1);
+        }
+        Path dataDir = Paths.get(args[0]);
+        Path[] files;
+        try (Stream<Path> s = Files.list(dataDir)) {
+            files = s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
+                    .sorted().toArray(Path[]::new);
+        }
+        if (files.length == 0) {
+            // Same failure mode as CountPerScriptBigrams: a header with no
+            // rows would look like a successful but empty audit.
+            System.err.println("ERROR: no *.train.gz files in " + dataDir);
+            System.exit(1);
+        }
+
+        System.out.printf("%-22s %14s %14s %14s %14s %12s | %14s %14s%n",
+                "script", "in_S_occ", "boundary_occ", "foreign_occ",
+                "ascii_run_occ", "total_occ",
+                "drop_foreign_dist", "drop_asciirun_dist");
+        System.out.println(repeat('-', 165));
+
+        for (Path file : files) {
+            String fname = file.getFileName().toString();
+            // Locale.ROOT: under e.g. a Turkish default locale "latin" would
+            // uppercase to "LATİN" and never match a UnicodeScript constant.
+            String name = fname.substring(0, fname.length() - TRAIN_SUFFIX.length())
+                    .toUpperCase(java.util.Locale.ROOT);
+            Character.UnicodeScript target;
+            try {
+                target = Character.UnicodeScript.valueOf(name);
+            } catch (IllegalArgumentException e) {
+                // Still skipped, but say so: a silently missing row is easy
+                // to overlook in the report.
+                System.err.println("WARN: skipping " + fname + ": '" + name
+                        + "' is not a Character.UnicodeScript constant");
+                continue;
+            }
+            auditFile(file, name.toLowerCase(java.util.Locale.ROOT), target);
+        }
+    }
+
+    /**
+     * Classifies every within-line bigram of one training file against
+     * {@code target} and prints that file's report row.
+     *
+     * @param file   gzip-compressed UTF-8 text file to read
+     * @param label  script name shown in the first column
+     * @param target script the file is supposed to represent
+     * @throws IOException if the file cannot be read or is not valid gzip
+     */
+    private static void auditFile(Path file, String label, Character.UnicodeScript target)
+            throws IOException {
+        long inS = 0;
+        long boundary = 0;
+        long foreign = 0;
+        long asciiRun = 0;
+        HashMap<Long, long[]> distinctAll = new HashMap<>(1 << 16);
+        HashMap<Long, long[]> distinctKeptUnderForeignDrop = new HashMap<>(1 << 16);
+        HashMap<Long, long[]> distinctKeptUnderAsciiDrop = new HashMap<>(1 << 16);
+
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(file)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                int prevCp = -1;
+                for (int i = 0; i < line.length(); ) {
+                    int cp = line.codePointAt(i);
+                    i += Character.charCount(cp);
+                    if (prevCp >= 0) {
+                        boolean aInS = inScriptOrCommon(prevCp, target);
+                        boolean bInS = inScriptOrCommon(cp, target);
+
+                        long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
+                        increment(distinctAll, packed);
+
+                        if (aInS && bInS) {
+                            inS++;
+                            increment(distinctKeptUnderForeignDrop, packed);
+                            increment(distinctKeptUnderAsciiDrop, packed);
+                        } else if (aInS != bInS) {
+                            boundary++;
+                            increment(distinctKeptUnderForeignDrop, packed);
+                            increment(distinctKeptUnderAsciiDrop, packed);
+                        } else {
+                            // both foreign (neither in S nor COMMON/INHERITED)
+                            foreign++;
+                            if (isLatinLetter(prevCp) && isLatinLetter(cp)) {
+                                asciiRun++;
+                            } else {
+                                // foreign interior but not a pure Latin-letter
+                                // pair: kept under the "letter run only" rule.
+                                increment(distinctKeptUnderAsciiDrop, packed);
+                            }
+                        }
+                    }
+                    prevCp = cp;
+                }
+            }
+        }
+
+        long total = inS + boundary + foreign;
+        int distAll = distinctAll.size();
+        int distForeignDrop = distinctKeptUnderForeignDrop.size();
+        int distAsciiDrop = distinctKeptUnderAsciiDrop.size();
+
+        System.out.printf("%-22s %,14d %,14d %,14d %,14d %,12d | %,14d %,14d%n",
+                label, inS, boundary, foreign, asciiRun, total,
+                distAll - distForeignDrop, distAll - distAsciiDrop);
+    }
+
+    /** True when {@code cp} belongs to {@code target}, COMMON or INHERITED. */
+    private static boolean inScriptOrCommon(int cp, Character.UnicodeScript target) {
+        Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+        return s == target
+                || s == Character.UnicodeScript.COMMON
+                || s == Character.UnicodeScript.INHERITED;
+    }
+
+    /** True for A–Z / a–z in either their ASCII or fullwidth form. */
+    private static boolean isLatinLetter(int cp) {
+        return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')
+                || (cp >= 0xFF21 && cp <= 0xFF3A) // fullwidth A-Z
+                || (cp >= 0xFF41 && cp <= 0xFF5A); // fullwidth a-z
+    }
+
+    /** Bumps the one-element counter stored under {@code key}, creating it at 1. */
+    private static void increment(HashMap<Long, long[]> map, long key) {
+        long[] c = map.get(key);
+        if (c == null) {
+            map.put(key, new long[]{1L});
+        } else {
+            c[0]++;
+        }
+    }
+
+    /** Returns a string made of {@code n} copies of {@code c}. */
+    private static String repeat(char c, int n) {
+        char[] b = new char[n];
+        java.util.Arrays.fill(b, c);
+        return new String(b);
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
new file mode 100644
index 0000000000..b287012ddc
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/CountPerScriptBigrams.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Stream;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Diagnostic tool for sizing a per-script F1 bigram store (v7 design).
+ *
+ * <p>Walks every {@code *.train.gz} in {@code dataDir}, treating each file as
+ * one script's corpus.  Counts (cpA, cpB) codepoint-pair frequencies within
+ * each line and reports, per script:
+ *
+ * <ul>
+ *   <li>total bigram occurrences (N)
+ *   <li>distinct pair count (U)
+ *   <li>singletons — pairs seen exactly once (these are usually the
+ *       worst candidates to keep; they often reflect OCR noise / rare
+ *       proper nouns and inflate U without helping discrimination)
+ *   <li>pairs seen at least 10 times
+ *   <li>coverage curve: how many of the most-frequent pairs are needed
+ *       to cover {x = 50, 75, 90, 95, 99, 99.9}% of all bigram occurrences
+ *   <li>pairs retained at each {@code MIN_COUNT_CUTOFFS} cutoff, and the
+ *       estimated v7 model size summed over all scripts at each cutoff,
+ *       assuming 1.3 bytes/pair (MPHF + 8-bit value, no fingerprint),
+ *       2.25 bytes/pair (MPHF + 8-bit fingerprint + 8-bit value) or
+ *       6.25 bytes/pair (open addressing with stored key)
+ * </ul>
+ *
+ * <p>Usage ({@code topK-per-script} is optional; when positive, the K most
+ * frequent pairs of every script are listed as well):
+ * <pre>
+ *   mvn -pl tika-ml/tika-ml-junkdetect exec:java \
+ *       -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.CountPerScriptBigrams \
+ *       -Dexec.args="/path/to/junkdetect [topK-per-script]"
+ * </pre>
+ *
+ * <p>No model output; this is read-only telemetry to inform the v7 sizing
+ * decision (see {@code 20260514-junk-retrain-v6.md}).
+ */
+public final class CountPerScriptBigrams {
+
+    private static final String TRAIN_SUFFIX = ".train.gz";
+
+    private static final int[] COVERAGE_PCT = {50, 75, 90, 95, 99};
+    private static final double[] COVERAGE_FRAC_HI = {0.999};
+
+    /** Cutoffs reported in the size-estimate table. */
+    private static final int[] MIN_COUNT_CUTOFFS = {1, 2, 3, 5, 10};
+
+    /** Bytes per retained pair for each candidate storage scheme. */
+    private static final double[] BYTES_PER_PAIR_SCHEMES = {1.3, 2.25, 6.25};
+    private static final String[] SCHEME_NAMES = {
+            "MPHF+val(1.3B)", "MPHF+fp+val(2.25B)", "open-addr+key(6.25B)"};
+
+    /** Width of the first column of the size table; fits "min_count>=10". */
+    private static final int CUTOFF_COL_WIDTH = 13;
+
+    private CountPerScriptBigrams() {}
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println(
+                    "Usage: CountPerScriptBigrams <dataDir> [topK-per-script]");
+            System.exit(1);
+        }
+        Path dataDir = Paths.get(args[0]);
+        int topK = 0;
+        if (args.length >= 2) {
+            try {
+                topK = Integer.parseInt(args[1]);
+            } catch (NumberFormatException e) {
+                // Report the bad argument instead of dying with a stack trace.
+                System.err.println(
+                        "ERROR: topK-per-script must be an integer, got: " + args[1]);
+                System.exit(1);
+            }
+        }
+
+        List<Path> trainFiles = new ArrayList<>();
+        try (Stream<Path> s = Files.list(dataDir)) {
+            s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
+             .sorted()
+             .forEach(trainFiles::add);
+        }
+        if (trainFiles.isEmpty()) {
+            System.err.println("ERROR: no *.train.gz files in " + dataDir);
+            System.exit(1);
+        }
+
+        System.out.printf("Found %d *.train.gz files in %s%n%n",
+                trainFiles.size(), dataDir);
+        System.out.printf(
+                "%-22s %12s %12s %12s %12s | %s%n",
+                "script", "total_N", "distinct_U", "singletons",
+                "U(>=10)", "coverage: pairs needed for [50,75,90,95,99,99.9]%");
+        System.out.println(repeat('-', 140));
+
+        long grandTotalN = 0;
+        long grandTotalU = 0;
+        long grandTotalUge10 = 0;
+
+        // script -> {N, U, retained pairs at each MIN_COUNT_CUTOFFS entry},
+        // kept for the global-size summary at the end.
+        Map<String, long[]> perScriptStats = new HashMap<>();
+
+        for (Path trainFile : trainFiles) {
+            String fname = trainFile.getFileName().toString();
+            // Locale.ROOT keeps the label independent of the default locale.
+            String script = fname.substring(0, fname.length() - TRAIN_SUFFIX.length())
+                    .toLowerCase(java.util.Locale.ROOT);
+
+            HashMap<Long, long[]> pairCounts = new HashMap<>(1 << 16);
+            long totalN = countPairs(trainFile, pairCounts);
+            long[] counts = countsDescending(pairCounts);
+            int distinctU = counts.length;
+
+            int singletons = 0;
+            int uGe10 = 0;
+            for (long c : counts) {
+                if (c == 1) {
+                    singletons++;
+                }
+                if (c >= 10) {
+                    uGe10++;
+                }
+            }
+
+            int[] coveragePairs = coveragePairs(counts, totalN);
+            StringBuilder cov = new StringBuilder();
+            for (int i = 0; i < coveragePairs.length; i++) {
+                if (i > 0) {
+                    cov.append(", ");
+                }
+                cov.append(String.format("%,d", coveragePairs[i]));
+            }
+
+            System.out.printf("%-22s %,12d %,12d %,12d %,12d | %s%n",
+                    script, totalN, distinctU, singletons, uGe10,
+                    cov.toString());
+
+            long[] sizeStats = new long[2 + MIN_COUNT_CUTOFFS.length];
+            sizeStats[0] = totalN;
+            sizeStats[1] = distinctU;
+            for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+                int minC = MIN_COUNT_CUTOFFS[i];
+                int kept = 0;
+                // counts is sorted descending, so stop at the first miss.
+                for (long c : counts) {
+                    if (c < minC) {
+                        break;
+                    }
+                    kept++;
+                }
+                sizeStats[2 + i] = kept;
+            }
+            perScriptStats.put(script, sizeStats);
+
+            if (topK > 0) {
+                printTopPairs(script, pairCounts, topK);
+            }
+
+            grandTotalN += totalN;
+            grandTotalU += distinctU;
+            grandTotalUge10 += uGe10;
+        }
+
+        System.out.println(repeat('-', 140));
+        System.out.printf("%-22s %,12d %,12d %12s %,12d%n%n",
+                "TOTAL", grandTotalN, grandTotalU,
+                "-", grandTotalUge10);
+
+        printSizeSummary(perScriptStats);
+    }
+
+    /**
+     * Reads a gzip-compressed UTF-8 file and adds every within-line codepoint
+     * bigram to {@code pairCounts} (packed pair -> one-element counter).
+     *
+     * @return the total number of bigram occurrences read
+     * @throws IOException if the file cannot be read or is not valid gzip
+     */
+    private static long countPairs(Path trainFile, HashMap<Long, long[]> pairCounts)
+            throws IOException {
+        long totalN = 0;
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(trainFile)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                int prevCp = -1;
+                for (int i = 0; i < line.length(); ) {
+                    int cp = line.codePointAt(i);
+                    i += Character.charCount(cp);
+                    if (prevCp >= 0) {
+                        long key = packPair(prevCp, cp);
+                        long[] c = pairCounts.get(key);
+                        if (c == null) {
+                            pairCounts.put(key, new long[]{1L});
+                        } else {
+                            c[0]++;
+                        }
+                        totalN++;
+                    }
+                    prevCp = cp;
+                }
+            }
+        }
+        return totalN;
+    }
+
+    /** Returns all pair counts as an array sorted from largest to smallest. */
+    private static long[] countsDescending(Map<Long, long[]> pairCounts) {
+        long[] counts = new long[pairCounts.size()];
+        int idx = 0;
+        for (long[] c : pairCounts.values()) {
+            counts[idx++] = c[0];
+        }
+        java.util.Arrays.sort(counts);
+        // Arrays.sort is ascending for primitives; reverse in place.
+        for (int i = 0, j = counts.length - 1; i < j; i++, j--) {
+            long t = counts[i];
+            counts[i] = counts[j];
+            counts[j] = t;
+        }
+        return counts;
+    }
+
+    /**
+     * For each coverage threshold t (COVERAGE_PCT followed by
+     * COVERAGE_FRAC_HI), finds the minimum k such that
+     * {@code sum(countsDesc[0..k-1]) / totalN >= t}.  A threshold that is
+     * never reached is reported as the full distinct-pair count.
+     */
+    private static int[] coveragePairs(long[] countsDesc, long totalN) {
+        double[] thresholds = new double[COVERAGE_PCT.length + COVERAGE_FRAC_HI.length];
+        for (int i = 0; i < COVERAGE_PCT.length; i++) {
+            thresholds[i] = COVERAGE_PCT[i] / 100.0;
+        }
+        for (int i = 0; i < COVERAGE_FRAC_HI.length; i++) {
+            thresholds[COVERAGE_PCT.length + i] = COVERAGE_FRAC_HI[i];
+        }
+
+        int[] needed = new int[thresholds.length];
+        long running = 0;
+        int tIdx = 0;
+        for (int k = 0; k < countsDesc.length && tIdx < thresholds.length; k++) {
+            running += countsDesc[k];
+            while (tIdx < thresholds.length
+                    && (double) running / totalN >= thresholds[tIdx]) {
+                needed[tIdx++] = k + 1;
+            }
+        }
+        for (; tIdx < thresholds.length; tIdx++) {
+            needed[tIdx] = countsDesc.length;
+        }
+        return needed;
+    }
+
+    /** Lists the {@code topK} most frequent pairs of one script. */
+    private static void printTopPairs(String script, Map<Long, long[]> pairCounts, int topK) {
+        System.out.printf("    top %d pairs in %s:%n", topK, script);
+        List<Map.Entry<Long, long[]>> sorted = new ArrayList<>(pairCounts.entrySet());
+        sorted.sort((a, b) -> Long.compare(b.getValue()[0], a.getValue()[0]));
+        for (int i = 0; i < Math.min(topK, sorted.size()); i++) {
+            Map.Entry<Long, long[]> e = sorted.get(i);
+            long k = e.getKey();
+            int cpA = (int) (k >>> 24);
+            int cpB = (int) (k & 0xFFFFFFL);
+            System.out.printf("      U+%04X U+%04X  (%c %c)  %,d%n",
+                    cpA, cpB,
+                    safePrint(cpA), safePrint(cpB),
+                    e.getValue()[0]);
+        }
+    }
+
+    /**
+     * Prints the cutoff vs. model-size table (retained pairs summed across
+     * all scripts, times bytes-per-pair) followed by the per-script retained
+     * pair counts, largest distinct-pair count first.
+     */
+    private static void printSizeSummary(Map<String, long[]> perScriptStats) {
+        System.out.println("=== Model-size estimates by min-count cutoff and storage scheme ===");
+        System.out.println("(sum of retained pairs across all scripts × bytes-per-pair)");
+        System.out.println();
+
+        // Header and data rows share the same widths so the columns line up:
+        // a CUTOFF_COL_WIDTH label column, then 21-character cells.
+        String labelFormat = "%-" + CUTOFF_COL_WIDTH + "s";
+        System.out.printf(labelFormat, "cutoff");
+        for (String name : SCHEME_NAMES) {
+            System.out.printf(" %20s", name);
+        }
+        System.out.printf(" %20s%n", "retained_pairs");
+        System.out.println(repeat('-', CUTOFF_COL_WIDTH + (SCHEME_NAMES.length + 1) * 21));
+
+        for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+            long retained = 0;
+            for (long[] stats : perScriptStats.values()) {
+                retained += stats[2 + i];
+            }
+            System.out.printf(labelFormat, "min_count>=" + MIN_COUNT_CUTOFFS[i]);
+            for (double bpp : BYTES_PER_PAIR_SCHEMES) {
+                System.out.printf(" %20s", humanBytes(retained * bpp));
+            }
+            System.out.printf(" %,20d%n", retained);
+        }
+
+        System.out.println();
+        System.out.println("Per-script pair counts retained at each cutoff:");
+        System.out.printf("%-22s", "script");
+        for (int c : MIN_COUNT_CUTOFFS) {
+            System.out.printf(" %12s", ">=" + c);
+        }
+        System.out.println();
+        List<Map.Entry<String, long[]>> sortedScripts =
+                new ArrayList<>(perScriptStats.entrySet());
+        sortedScripts.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> -e.getValue()[1]));
+        for (Map.Entry<String, long[]> e : sortedScripts) {
+            System.out.printf("%-22s", e.getKey());
+            for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
+                System.out.printf(" %,12d", e.getValue()[2 + i]);
+            }
+            System.out.println();
+        }
+    }
+
+    /**
+     * Pack two codepoints (each up to 21 bits) into a single long: cpA sits
+     * above bit 24, cpB occupies the low 24 bits.
+     */
+    private static long packPair(int cpA, int cpB) {
+        return ((long) cpA << 24) | (cpB & 0xFFFFFFL);
+    }
+
+    /**
+     * Character to show for {@code cp} in the top-K dump: {@code '.'} for
+     * control or undefined codepoints, {@code '?'} for codepoints that need
+     * two chars, otherwise the character itself.
+     */
+    private static char safePrint(int cp) {
+        if (cp < 0x20 || cp == 0x7F || !Character.isDefined(cp)) {
+            return '.';
+        }
+        if (Character.charCount(cp) != 1) {
+            return '?';
+        }
+        return (char) cp;
+    }
+
+    /** Returns a string made of {@code n} copies of {@code c}. */
+    private static String repeat(char c, int n) {
+        char[] buf = new char[n];
+        java.util.Arrays.fill(buf, c);
+        return new String(buf);
+    }
+
+    /** Formats a byte count with a binary (1024-based) B / KB / MB / GB unit. */
+    private static String humanBytes(double bytes) {
+        if (bytes < 1024) {
+            return String.format("%.0f B", bytes);
+        }
+        if (bytes < 1024 * 1024) {
+            return String.format("%.1f KB", bytes / 1024.0);
+        }
+        if (bytes < 1024L * 1024 * 1024) {
+            return String.format("%.2f MB", bytes / (1024.0 * 1024));
+        }
+        return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024));
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
new file mode 100644
index 0000000000..bcda57c9f7
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/LineScriptFractions.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * For each {@code *.train.gz} file in a directory, compute per-line statistics
+ * of "target-script fraction" — i.e. the fraction of codepoints in each line
+ * that belong to the script the file is supposed to represent.
+ *
+ * <p>Codepoints whose script is COMMON, INHERITED or UNKNOWN are left out of
+ * both the numerator and the denominator.  A line with no remaining
+ * codepoints is scored as 0% and therefore lands in the lowest bucket.
+ *
+ * <p>Lines are bucketed on the edges
+ * [0, 5, 10, 20, 30, 50, 70, 90, 100]% so we can pick a per-script keep
+ * threshold (e.g. "drop lines with &lt;20% HAN codepoints").  For each file
+ * the report shows the line count, the number of lines below 5%, and the
+ * cumulative percentage of lines that would be dropped at thresholds
+ * 10/20/30/50/70/90/100.
+ *
+ * <p>Each {@code {script}.train.gz} maps to a {@link Character.UnicodeScript}
+ * by uppercasing the file basename (locale-independently) and looking it up
+ * with {@code UnicodeScript.valueOf}.  Files whose basename is not a
+ * {@code UnicodeScript} constant are reported and skipped.
+ *
+ * <p>Usage:
+ * <pre>
+ *   java LineScriptFractions &lt;dataDir&gt;
+ * </pre>
+ */
+public final class LineScriptFractions {
+
+    /** Lower edges (percent) of the target-fraction buckets. */
+    private static final int[] BUCKETS = {0, 5, 10, 20, 30, 50, 70, 90, 100};
+
+    /** Drop thresholds (percent) reported per file; order matches the header row. */
+    private static final int[] THRESHOLDS = {10, 20, 30, 50, 70, 90, 100};
+
+    /** File-name suffix that marks a per-script training file. */
+    private static final String TRAIN_SUFFIX = ".train.gz";
+
+    private LineScriptFractions() {}
+
+    /**
+     * Prints one report row per {@code *.train.gz} file found in the directory.
+     *
+     * @param args {@code args[0]} is the directory holding the training files
+     * @throws IOException if the directory or one of its files cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: LineScriptFractions <dataDir>");
+            System.exit(1);
+        }
+        Path dataDir = Paths.get(args[0]);
+        Path[] files;
+        try (var s = Files.list(dataDir)) {
+            files = s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
+                    .sorted().toArray(Path[]::new);
+        }
+        if (files.length == 0) {
+            System.err.println("No *.train.gz files in " + dataDir);
+            System.exit(1);
+        }
+
+        System.out.printf("%-20s %10s %10s | %s%n",
+                "script", "lines", "<5%",
+                "lines at target-frac threshold (cumulative dropped %)");
+        System.out.println("                                            "
+                + " <10%   <20%   <30%   <50%   <70%   <90%  <100%");
+        System.out.println(repeat('-', 110));
+
+        for (Path file : files) {
+            String fname = file.getFileName().toString();
+            // Locale.ROOT keeps the lookup key stable: under e.g. a Turkish
+            // default locale a plain toUpperCase() maps 'i' to U+0130 and
+            // UnicodeScript.valueOf() would reject an otherwise valid name.
+            String name = fname.substring(0, fname.length() - TRAIN_SUFFIX.length())
+                    .toUpperCase(java.util.Locale.ROOT);
+            Character.UnicodeScript target = mapScript(name);
+            if (target == null) {
+                System.out.printf("%-20s  (no UnicodeScript mapping for '%s')%n", name, name);
+                continue;
+            }
+
+            long[] bucketCounts = countBuckets(file, target);
+            // Every line increments exactly one bucket, so the sum is the line count.
+            long lines = 0;
+            for (long c : bucketCounts) {
+                lines += c;
+            }
+
+            StringBuilder sb = new StringBuilder();
+            for (int t : THRESHOLDS) {
+                sb.append(String.format(" %6.1f", droppedPercent(bucketCounts, lines, t)));
+            }
+
+            long below5 = bucketCounts[0];
+            System.out.printf("%-20s %,10d %,10d |%s%n",
+                    name.toLowerCase(java.util.Locale.ROOT), lines, below5, sb.toString());
+        }
+    }
+
+    /**
+     * Reads a gzipped UTF-8 file and counts its lines per target-fraction bucket.
+     *
+     * @return an array parallel to {@link #BUCKETS}; element {@code i} is the
+     *         number of lines whose percentage is at least {@code BUCKETS[i]}
+     *         and below the next edge (the last element holds the 100% lines)
+     */
+    private static long[] countBuckets(Path file, Character.UnicodeScript target)
+            throws IOException {
+        long[] bucketCounts = new long[BUCKETS.length];
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(
+                        new GZIPInputStream(Files.newInputStream(file)),
+                        StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                bucketCounts[bucketIndex(targetPercent(line, target))]++;
+            }
+        }
+        return bucketCounts;
+    }
+
+    /** Percentage (0-100) of script-bearing codepoints in {@code line} that are {@code target}. */
+    private static double targetPercent(String line, Character.UnicodeScript target) {
+        int total = 0;
+        int matching = 0;
+        for (int i = 0; i < line.length(); ) {
+            int cp = line.codePointAt(i);
+            i += Character.charCount(cp);
+            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            if (s == Character.UnicodeScript.COMMON
+                    || s == Character.UnicodeScript.INHERITED
+                    || s == Character.UnicodeScript.UNKNOWN) {
+                // Don't count toward denominator: punctuation,
+                // spaces, diacritics are script-neutral.
+                continue;
+            }
+            total++;
+            if (s == target) {
+                matching++;
+            }
+        }
+        return total == 0 ? 0.0 : 100.0 * matching / total;
+    }
+
+    /** Index of the last bucket whose lower edge is not above {@code pct}. */
+    private static int bucketIndex(double pct) {
+        int b = 0;
+        while (b < BUCKETS.length - 1 && pct >= BUCKETS[b + 1]) {
+            b++;
+        }
+        return b;
+    }
+
+    /**
+     * Percentage of lines that a "drop if below {@code threshold}" rule removes.
+     * A bucket is dropped when its upper edge is at or below the threshold; the
+     * final bucket (lines at 100%) has no upper edge and is never dropped.
+     */
+    private static double droppedPercent(long[] bucketCounts, long lines, int threshold) {
+        long dropped = 0;
+        for (int j = 0; j < BUCKETS.length - 1; j++) {
+            if (BUCKETS[j + 1] <= threshold) {
+                dropped += bucketCounts[j];
+            }
+        }
+        // Math.max guards the division for an empty file.
+        return 100.0 * dropped / Math.max(1, lines);
+    }
+
+    /** Looks up a {@code UnicodeScript} constant by exact name; {@code null} if there is none. */
+    private static Character.UnicodeScript mapScript(String name) {
+        try {
+            return Character.UnicodeScript.valueOf(name);
+        } catch (IllegalArgumentException e) {
+            return null;
+        }
+    }
+
+    /** Returns {@code c} repeated {@code n} times. */
+    private static String repeat(char c, int n) {
+        char[] b = new char[n];
+        java.util.Arrays.fill(b, c);
+        return new String(b);
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
new file mode 100644
index 0000000000..b384d5f4c5
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/ScriptCensus.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Codepoint-level script census of one or more text files.  For each input
+ * file, reports the percentage of codepoints in each {@link
+ * Character.UnicodeScript}, followed by a histogram of the dominant (most
+ * frequent) script of each line.
+ *
+ * <p>Codepoints whose script is COMMON, INHERITED or UNKNOWN are ignored.
+ * If a line contains a tab, everything up to and including the first tab is
+ * treated as a line-number prefix and skipped.  At most {@code MAX_LINES}
+ * lines are read from each file.
+ *
+ * <p>Useful to verify whether {@code BuildJunkTrainingData} is bucketing
+ * languages correctly: e.g. Japanese is usually a mix of HIRAGANA, KATAKANA
+ * and HAN; if {@code jpn} ends up in {@code han.train.gz} we want to know
+ * what fraction of its codepoints are actually Han ideographs vs. kana.
+ *
+ * <p>Usage:
+ * <pre>
+ *   java ScriptCensus &lt;file&gt; [file ...]   # supports .gz and plain text
+ * </pre>
+ */
+public final class ScriptCensus {
+
+    /** Max lines to sample per file (set high for full pass). */
+    private static final int MAX_LINES = 200_000;
+
+    private ScriptCensus() {}
+
+    /**
+     * Prints a census for every argument that names a regular file.
+     *
+     * @param args paths of the files to analyse; non-files are skipped with a note
+     * @throws IOException if a file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Usage: ScriptCensus <file> [file ...]");
+            System.exit(1);
+        }
+        for (String arg : args) {
+            Path f = Paths.get(arg);
+            if (!Files.isRegularFile(f)) {
+                System.err.println("Skipping non-file: " + f);
+                continue;
+            }
+            reportOne(f);
+            System.out.println();
+        }
+    }
+
+    /** Reads up to {@code MAX_LINES} lines of {@code file} and prints both tables. */
+    private static void reportOne(Path file) throws IOException {
+        Map<String, long[]> scriptCounts = new HashMap<>();
+        // Per-line dominant-script histogram.
+        Map<String, long[]> dominantHistogram = new HashMap<>();
+        long lines = 0;
+
+        try (BufferedReader r = open(file)) {
+            String line;
+            while ((line = r.readLine()) != null && lines < MAX_LINES) {
+                lines++;
+                // For MADLAD/Wikipedia files the format is "lineNum TAB text";
+                // strip the prefix if present.
+                int tab = line.indexOf('\t');
+                String text = tab >= 0 ? line.substring(tab + 1) : line;
+
+                String dominant = tallyLine(text, scriptCounts);
+                if (dominant != null) {
+                    increment(dominantHistogram, dominant);
+                }
+            }
+        }
+
+        long total = sum(scriptCounts);
+        System.out.printf("File: %s%n", file);
+        System.out.printf(
+                "  lines sampled: %,d   total codepoints (excl. COMMON/INHERITED): %,d%n%n",
+                lines, total);
+
+        if (total == 0) {
+            System.out.println("  (empty / no scripted codepoints)");
+            return;
+        }
+
+        System.out.println("  Codepoint distribution by script:");
+        long cumulative = 0;
+        for (Map.Entry<String, long[]> e : sortedByCountDesc(scriptCounts)) {
+            long c = e.getValue()[0];
+            // Accumulate before filtering so hidden rows still count toward "cum".
+            cumulative += c;
+            double pct = 100.0 * c / total;
+            double cumPct = 100.0 * cumulative / total;
+            if (pct < 0.01 && c < 100) {
+                continue;
+            }
+            System.out.printf("    %-22s %,14d  %6.2f%%  (cum %6.2f%%)%n",
+                    e.getKey(), c, pct, cumPct);
+        }
+
+        System.out.println();
+        System.out.println("  Per-line dominant-script histogram:");
+        long domTotal = sum(dominantHistogram);
+        for (Map.Entry<String, long[]> e : sortedByCountDesc(dominantHistogram)) {
+            long c = e.getValue()[0];
+            double pct = 100.0 * c / domTotal;
+            if (pct < 0.05) {
+                continue;
+            }
+            System.out.printf("    %-22s %,12d  %6.2f%% of lines%n",
+                    e.getKey(), c, pct);
+        }
+    }
+
+    /**
+     * Adds the script of every script-bearing codepoint in {@code text} to
+     * {@code scriptCounts}.
+     *
+     * @return the name of the script with the most codepoints in this line,
+     *         or {@code null} if the line has no script-bearing codepoints
+     */
+    private static String tallyLine(String text, Map<String, long[]> scriptCounts) {
+        Map<String, Long> perLine = new HashMap<>();
+        for (int i = 0; i < text.length(); ) {
+            int cp = text.codePointAt(i);
+            i += Character.charCount(cp);
+            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+            if (s == Character.UnicodeScript.COMMON
+                    || s == Character.UnicodeScript.INHERITED
+                    || s == Character.UnicodeScript.UNKNOWN) {
+                continue;
+            }
+            String name = s.name();
+            increment(scriptCounts, name);
+            perLine.merge(name, 1L, Long::sum);
+        }
+        // Identify the dominant script for this line; on a tie the first
+        // one encountered in map iteration order wins.
+        String dom = null;
+        long best = -1;
+        for (Map.Entry<String, Long> e : perLine.entrySet()) {
+            if (e.getValue() > best) {
+                best = e.getValue();
+                dom = e.getKey();
+            }
+        }
+        return dom;
+    }
+
+    /** Adds one to the counter stored under {@code key}, creating it if needed. */
+    private static void increment(Map<String, long[]> counts, String key) {
+        counts.computeIfAbsent(key, k -> new long[1])[0]++;
+    }
+
+    /** Sum of all counters in {@code counts}. */
+    private static long sum(Map<String, long[]> counts) {
+        long total = 0;
+        for (long[] v : counts.values()) {
+            total += v[0];
+        }
+        return total;
+    }
+
+    /** Entries of {@code counts} ordered from the largest counter to the smallest. */
+    private static List<Map.Entry<String, long[]>> sortedByCountDesc(
+            Map<String, long[]> counts) {
+        List<Map.Entry<String, long[]>> sorted = new ArrayList<>(counts.entrySet());
+        sorted.sort(Comparator.comparingLong(
+                (Map.Entry<String, long[]> e) -> -e.getValue()[0]));
+        return sorted;
+    }
+
+    /** Opens {@code path} as UTF-8 text, gunzipping when the name ends in {@code .gz}. */
+    private static BufferedReader open(Path path) throws IOException {
+        if (path.getFileName().toString().endsWith(".gz")) {
+            return new BufferedReader(new InputStreamReader(
+                    new GZIPInputStream(Files.newInputStream(path)),
+                    StandardCharsets.UTF_8));
+        }
+        return Files.newBufferedReader(path, StandardCharsets.UTF_8);
+    }
+}

(tika) 02/06: junk-detector: corpus diagnostic tools for v7 sizing

Reply via email to