This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch junk-detector-v6 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 49eb7b48840c95200af8f21ddfe614048a5719f7 Author: tballison <[email protected]> AuthorDate: Thu May 14 12:02:22 2026 -0400 junk-detector: corpus diagnostic tools for v7 sizing Five read-only tools that report training-corpus statistics used to inform per-script F1 sizing decisions. None of these are wired into the main trainer or model output; they're invoked manually. * CountPerScriptBigrams - distinct (cpA,cpB) pair counts per script, with coverage curves and per-cutoff model-size estimates for several candidate storage schemes (MPHF+val, MPHF+fp+val, open-addressing). * AnalyzeHanByBlock - bucket HAN bigrams by the Unicode block of each codepoint, with ASCII split into digit/letter/punct. Surfaces the CJK Unified / Hiragana / Katakana / ASCII composition of the HAN pool. * ScriptCensus - per-line dominant-script histogram for one or more text files (gz or plain). Used to verify how BuildJunkTrainingData routes mixed-script languages like Japanese. * LineScriptFractions - for each *.train.gz, histogram of the per-line target-script-fraction, with cumulative drop percentages at thresholds 10/20/30/50/70/90/100. Identifies scripts whose corpora are mostly off-target (e.g. GOTHIC: 40% of lines are <5% Gothic). * BoundaryBigramAudit - classify every bigram in *.train.gz as in-script / script-boundary / foreign-interior / pure-Latin-letter-run, and report distinct-pair drop counts under two candidate filter rules. All five build under existing checkstyle; no test fixtures added. 
Co-authored-by: Cursor <[email protected]> --- .../ml/junkdetect/tools/AnalyzeHanByBlock.java | 201 +++++++++++++ .../ml/junkdetect/tools/BoundaryBigramAudit.java | 170 +++++++++++ .../ml/junkdetect/tools/CountPerScriptBigrams.java | 326 +++++++++++++++++++++ .../ml/junkdetect/tools/LineScriptFractions.java | 155 ++++++++++ .../tika/ml/junkdetect/tools/ScriptCensus.java | 165 +++++++++++ 5 files changed, 1017 insertions(+) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java new file mode 100644 index 0000000000..08b2aa4eb5 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/AnalyzeHanByBlock.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

/**
 * Diagnostic tool: reads one gzip-compressed UTF-8 text file (typically
 * {@code han.train.gz}), labels every codepoint with a short name derived
 * from its {@link Character.UnicodeBlock}, and reports how the adjacent
 * codepoint pairs (bigrams) are distributed over (left block, right block)
 * combinations.
 *
 * <p>Two tables are printed to stdout: one row per block pair (occurrences,
 * distinct pairs, singletons, share of all bigrams), followed by a roll-up
 * keyed on the left-hand block only.
 *
 * <p>Usage:
 * <pre>
 *   java ... AnalyzeHanByBlock /path/to/junkdetect/han.train.gz
 * </pre>
 */
public final class AnalyzeHanByBlock {

    private AnalyzeHanByBlock() {
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the gzip file to analyze
     * @throws IOException if the file cannot be opened or decompressed
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: AnalyzeHanByBlock <train.gz>");
            System.exit(1);
        }
        Path input = Paths.get(args[0]);

        // "LEFT|RIGHT" block label -> (packed codepoint pair -> occurrence counter)
        Map<String, Map<Long, long[]>> pairsByBlocks = new HashMap<>();
        // "LEFT|RIGHT" block label -> total occurrences of that block pair
        Map<String, long[]> occurrencesByBlocks = new HashMap<>();
        long bigramTotal = 0;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(Files.newInputStream(input)), StandardCharsets.UTF_8))) {
            for (String text = reader.readLine(); text != null; text = reader.readLine()) {
                // Bigrams never span a line break: the "previous" state resets per line.
                int lastCp = -1;
                String lastBlock = null;
                int pos = 0;
                while (pos < text.length()) {
                    int cp = text.codePointAt(pos);
                    pos += Character.charCount(cp);
                    String block = blockShortName(cp);
                    if (lastCp >= 0) {
                        String blocks = lastBlock + "|" + block;
                        // A codepoint needs at most 21 bits, so 24 bits per half is enough.
                        long packed = ((long) lastCp << 24) | (cp & 0xFFFFFFL);
                        pairsByBlocks.computeIfAbsent(blocks, k -> new HashMap<>(256))
                                .computeIfAbsent(packed, k -> new long[1])[0]++;
                        occurrencesByBlocks.computeIfAbsent(blocks, k -> new long[1])[0]++;
                        bigramTotal++;
                    }
                    lastCp = cp;
                    lastBlock = block;
                }
            }
        }

        System.out.printf("File: %s%n", input);
        System.out.printf("Total bigram occurrences: %,d%n%n", bigramTotal);

        printBlockPairTable(pairsByBlocks, occurrencesByBlocks, bigramTotal);
        printLeadingBlockRollup(pairsByBlocks);
    }

    /**
     * Prints one row per block pair, most frequent first. Rows that are both
     * below 0.1% of all bigrams and below 1000 occurrences are not printed,
     * but they still contribute to the two totals under the table.
     */
    private static void printBlockPairTable(Map<String, Map<Long, long[]>> pairsByBlocks,
                                            Map<String, long[]> occurrencesByBlocks,
                                            long bigramTotal) {
        List<Map.Entry<String, long[]>> rows = new ArrayList<>(occurrencesByBlocks.entrySet());
        Comparator<Map.Entry<String, long[]>> mostFrequentFirst =
                (a, b) -> Long.compare(b.getValue()[0], a.getValue()[0]);
        rows.sort(mostFrequentFirst);

        String rule = repeat('-', 105);
        System.out.printf("%-50s %14s %14s %12s %8s%n",
                "block_pair", "occurrences", "distinct", "singletons", "%total");
        System.out.println(rule);

        long allDistinct = 0;
        long allSingletons = 0;
        for (Map.Entry<String, long[]> row : rows) {
            String blocks = row.getKey();
            long occurrences = row.getValue()[0];
            Map<Long, long[]> pairs = pairsByBlocks.get(blocks);
            int distinct = pairs.size();
            int singletons = countSingletons(pairs);
            allDistinct += distinct;
            allSingletons += singletons;

            double share = 100.0 * occurrences / bigramTotal;
            boolean tailNoise = share < 0.1 && occurrences < 1000;
            if (!tailNoise) {
                System.out.printf("%-50s %,14d %,14d %,12d %7.2f%%%n",
                        blocks, occurrences, distinct, singletons, share);
            }
        }
        System.out.println(rule);
        System.out.printf("Total distinct pairs (incl. tail): %,d%n", allDistinct);
        System.out.printf("Total singletons (incl. tail): %,d%n", allSingletons);
    }

    /** Number of pairs in {@code pairs} whose counter is exactly one. */
    private static int countSingletons(Map<Long, long[]> pairs) {
        int once = 0;
        for (long[] counter : pairs.values()) {
            if (counter[0] == 1) {
                once++;
            }
        }
        return once;
    }

    /**
     * Prints occurrences and distinct-pair counts summed over every block pair
     * that shares the same left-hand block, most frequent first.
     */
    private static void printLeadingBlockRollup(Map<String, Map<Long, long[]>> pairsByBlocks) {
        System.out.println();
        System.out.println("=== Per-leading-block roll-up ===");

        Map<String, Long> distinctByLeading = new HashMap<>();
        Map<String, Long> occurrencesByLeading = new HashMap<>();
        for (Map.Entry<String, Map<Long, long[]>> entry : pairsByBlocks.entrySet()) {
            String blocks = entry.getKey();
            String leading = blocks.substring(0, blocks.indexOf('|'));
            Map<Long, long[]> pairs = entry.getValue();

            long occurrences = 0;
            for (long[] counter : pairs.values()) {
                occurrences += counter[0];
            }
            distinctByLeading.merge(leading, (long) pairs.size(), Long::sum);
            occurrencesByLeading.merge(leading, occurrences, Long::sum);
        }

        List<Map.Entry<String, Long>> rows = new ArrayList<>(occurrencesByLeading.entrySet());
        Comparator<Map.Entry<String, Long>> mostFrequentFirst =
                (a, b) -> Long.compare(b.getValue(), a.getValue());
        rows.sort(mostFrequentFirst);

        System.out.printf("%-35s %14s %14s%n",
                "leading_block", "occurrences", "distinct(rough)");
        System.out.println(repeat('-', 70));
        for (Map.Entry<String, Long> row : rows) {
            System.out.printf("%-35s %,14d %,14d%n",
                    row.getKey(), row.getValue(), distinctByLeading.get(row.getKey()));
        }
    }

    /**
     * Report label for the Unicode block containing {@code cp}.
     *
     * <p>BASIC_LATIN is split into ASCII_DIGIT / ASCII_LETTER / ASCII_PUNCT,
     * a fixed set of CJK, kana, Hangul and Latin-1 blocks get shortened
     * labels, and every other block keeps its {@link Character.UnicodeBlock}
     * name. Codepoints that belong to no block are labelled {@code UNK}.
     */
    private static String blockShortName(int cp) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(cp);
        if (block == null) {
            return "UNK";
        }
        String name = block.toString();
        if (name.startsWith("CJK_UNIFIED_IDEOGRAPHS_EXTENSION")) {
            // Keep only the trailing extension letter, e.g. "..._EXTENSION_B" -> "CJK_EXT_B".
            return "CJK_EXT_" + name.substring(name.lastIndexOf('_') + 1);
        }
        switch (name) {
            case "BASIC_LATIN":
                return asciiClass(cp);
            case "CJK_UNIFIED_IDEOGRAPHS":
                return "CJK_UNIFIED";
            case "CJK_SYMBOLS_AND_PUNCTUATION":
                return "CJK_PUNCT";
            case "CJK_COMPATIBILITY_IDEOGRAPHS":
                return "CJK_COMPAT";
            case "CJK_COMPATIBILITY_FORMS":
                return "CJK_COMPAT_FORMS";
            case "HALFWIDTH_AND_FULLWIDTH_FORMS":
                return "HALF_FULL";
            case "KATAKANA_PHONETIC_EXTENSIONS":
                return "KATAKANA_EXT";
            case "HANGUL_SYLLABLES":
                return "HANGUL";
            case "HANGUL_COMPATIBILITY_JAMO":
                return "HANGUL_JAMO_C";
            case "LATIN_1_SUPPLEMENT":
                return "LATIN1";
            default:
                // HIRAGANA, KATAKANA, HANGUL_JAMO and all other blocks keep their own name.
                return name;
        }
    }

    /** Sub-label for a codepoint already known to be in BASIC_LATIN. */
    private static String asciiClass(int cp) {
        if (cp >= '0' && cp <= '9') {
            return "ASCII_DIGIT";
        }
        boolean upper = cp >= 'A' && cp <= 'Z';
        boolean lower = cp >= 'a' && cp <= 'z';
        return (upper || lower) ? "ASCII_LETTER" : "ASCII_PUNCT";
    }

    /** Returns a string made of {@code n} copies of {@code c}. */
    private static String repeat(char c, int n) {
        StringBuilder sb = new StringBuilder(n);
        for (int i = 0; i < n; i++) {
            sb.append(c);
        }
        return sb.toString();
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

/**
 * For each {@code *.train.gz} file in a directory, classifies every adjacent
 * codepoint pair by its relation to the target script S, where S is the
 * {@link Character.UnicodeScript} whose name equals the file's base name
 * (case-insensitively). Categories:
 *
 * <ul>
 *   <li>in-S interior: both codepoints are in S or in COMMON/INHERITED
 *   <li>S boundary: exactly one codepoint is in S-or-COMMON/INHERITED
 *   <li>foreign interior: neither codepoint is in S-or-COMMON/INHERITED
 *   <li>Latin-letter run: the subset of foreign interior where both
 *       codepoints are A-Z/a-z, in either their ASCII or fullwidth forms
 * </ul>
 *
 * <p>One row is printed per file: the occurrence count of each category, the
 * total, and the number of distinct pairs that would disappear if (a) all
 * foreign-interior pairs or (b) only Latin-letter-run pairs were dropped.
 * A pair counts as "disappearing" only if it never occurs in a kept position.
 *
 * <p>Files whose base name is not a {@code UnicodeScript} constant are
 * skipped with a note on stderr.
 *
 * <p>Usage:
 * <pre>
 *   java ... BoundaryBigramAudit &lt;dataDir&gt;
 * </pre>
 */
public final class BoundaryBigramAudit {

    private static final String TRAIN_SUFFIX = ".train.gz";

    private BoundaryBigramAudit() {
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the directory holding the {@code *.train.gz} files
     * @throws IOException if the directory or one of the files cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: BoundaryBigramAudit <dataDir>");
            System.exit(1);
        }
        Path dataDir = Paths.get(args[0]);
        Path[] files;
        try (Stream<Path> s = Files.list(dataDir)) {
            files = s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
                    .sorted().toArray(Path[]::new);
        }

        System.out.printf("%-22s %14s %14s %14s %14s %12s | %14s %14s%n",
                "script", "in_S_occ", "boundary_occ", "foreign_occ",
                "ascii_run_occ", "total_occ",
                "drop_foreign_dist", "drop_asciirun_dist");
        System.out.println(repeat('-', 165));

        for (Path file : files) {
            String fname = file.getFileName().toString();
            String base = fname.substring(0, fname.length() - TRAIN_SUFFIX.length());
            Character.UnicodeScript target = scriptFor(base);
            if (target == null) {
                // Report the skip instead of silently producing a shorter table.
                System.err.printf("Skipping %s: '%s' is not a UnicodeScript name%n",
                        fname, base);
                continue;
            }
            auditFile(file, base.toLowerCase(Locale.ROOT), target);
        }
    }

    /**
     * Resolves a file base name to a script constant.
     *
     * <p>Upper-casing uses {@link Locale#ROOT} so that the result does not
     * depend on the JVM's default locale (a Turkish default locale would turn
     * {@code latin} into {@code LAT\u0130N}, which is not a constant name).
     *
     * @return the matching script, or {@code null} if there is none
     */
    private static Character.UnicodeScript scriptFor(String baseName) {
        try {
            return Character.UnicodeScript.valueOf(baseName.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException notAScript) {
            return null;
        }
    }

    /** Scans one gzip file and prints its summary row. */
    private static void auditFile(Path file, String label, Character.UnicodeScript target)
            throws IOException {
        long inS = 0;
        long boundary = 0;
        long foreign = 0;
        long asciiRun = 0;
        // Only the number of distinct pairs is reported, so sets are sufficient.
        Set<Long> distinctAll = new HashSet<>(1 << 16);
        Set<Long> keptUnderForeignDrop = new HashSet<>(1 << 16);
        Set<Long> keptUnderAsciiDrop = new HashSet<>(1 << 16);

        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(
                        new GZIPInputStream(Files.newInputStream(file)),
                        StandardCharsets.UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                // Pairs never span a line break.
                int prevCp = -1;
                boolean prevInS = false;
                boolean prevLetter = false;
                for (int i = 0; i < line.length(); ) {
                    int cp = line.codePointAt(i);
                    i += Character.charCount(cp);
                    boolean curInS = inScriptOrCommon(cp, target);
                    boolean curLetter = isLatinLetter(cp);
                    if (prevCp >= 0) {
                        // A codepoint needs at most 21 bits, so 24 bits per half is enough.
                        Long packed = ((long) prevCp << 24) | (cp & 0xFFFFFFL);
                        distinctAll.add(packed);

                        if (prevInS && curInS) {
                            inS++;
                            keptUnderForeignDrop.add(packed);
                            keptUnderAsciiDrop.add(packed);
                        } else if (prevInS != curInS) {
                            boundary++;
                            keptUnderForeignDrop.add(packed);
                            keptUnderAsciiDrop.add(packed);
                        } else {
                            // Neither side is in S or COMMON/INHERITED.
                            foreign++;
                            if (prevLetter && curLetter) {
                                asciiRun++;
                            } else {
                                // Foreign but not a pure Latin-letter pair: the
                                // letter-run-only rule would keep it.
                                keptUnderAsciiDrop.add(packed);
                            }
                        }
                    }
                    prevCp = cp;
                    prevInS = curInS;
                    prevLetter = curLetter;
                }
            }
        }

        long total = inS + boundary + foreign;
        int distAll = distinctAll.size();
        System.out.printf("%-22s %,14d %,14d %,14d %,14d %,12d | %,14d %,14d%n",
                label, inS, boundary, foreign, asciiRun, total,
                distAll - keptUnderForeignDrop.size(),
                distAll - keptUnderAsciiDrop.size());
    }

    /** True if {@code cp} belongs to {@code target}, COMMON or INHERITED. */
    private static boolean inScriptOrCommon(int cp, Character.UnicodeScript target) {
        Character.UnicodeScript s = Character.UnicodeScript.of(cp);
        return s == target
                || s == Character.UnicodeScript.COMMON
                || s == Character.UnicodeScript.INHERITED;
    }

    /** True for A-Z / a-z in their ASCII or fullwidth (U+FF21.., U+FF41..) forms. */
    private static boolean isLatinLetter(int cp) {
        return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z')
                || (cp >= 0xFF21 && cp <= 0xFF3A)   // fullwidth A-Z
                || (cp >= 0xFF41 && cp <= 0xFF5A);  // fullwidth a-z
    }

    /**
     * Adds one to the counter stored under {@code key}, creating it if needed.
     * Retained for callers that need per-pair counts; the audit itself only
     * needs distinct-pair sets.
     */
    private static void increment(HashMap<Long, long[]> map, long key) {
        map.computeIfAbsent(key, k -> new long[1])[0]++;
    }

    /** Returns a string made of {@code n} copies of {@code c}. */
    private static String repeat(char c, int n) {
        char[] b = new char[n];
        java.util.Arrays.fill(b, c);
        return new String(b);
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

/**
 * Diagnostic tool for sizing a per-script bigram store.
 *
 * <p>Walks every {@code *.train.gz} in {@code dataDir}, treating each file as
 * one script's corpus (the script label is the lower-cased file base name).
 * Counts (cpA, cpB) codepoint-pair frequencies within each line and reports,
 * per script:
 *
 * <ul>
 *   <li>total bigram occurrences (N)
 *   <li>distinct pair count (U)
 *   <li>singletons: pairs seen exactly once
 *   <li>pairs seen at least 10 times
 *   <li>coverage curve: how many of the most frequent pairs are needed to
 *       cover 50, 75, 90, 95, 99 and 99.9 percent of all occurrences
 * </ul>
 *
 * <p>After the per-script table it prints, for each cutoff in
 * {@link #MIN_COUNT_CUTOFFS}, the number of pairs retained across all scripts
 * and the resulting size under each scheme in {@link #BYTES_PER_PAIR_SCHEMES},
 * followed by the per-script retained counts.
 *
 * <p>Usage:
 * <pre>
 *   mvn -pl tika-ml/tika-ml-junkdetect exec:java \
 *     -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.CountPerScriptBigrams \
 *     -Dexec.args="/path/to/junkdetect [topK-per-script]"
 * </pre>
 *
 * <p>When {@code topK-per-script} is given and positive, the {@code topK}
 * most frequent pairs of each script are listed under its row.
 *
 * <p>No model output; this is read-only telemetry.
 */
public final class CountPerScriptBigrams {

    private static final String TRAIN_SUFFIX = ".train.gz";

    /** Fractions of all occurrences for which the coverage curve is reported. */
    private static final double[] COVERAGE_THRESHOLDS = {0.50, 0.75, 0.90, 0.95, 0.99, 0.999};

    /** Cutoffs reported in the size-estimate table. */
    private static final int[] MIN_COUNT_CUTOFFS = {1, 2, 3, 5, 10};

    /** Bytes per retained pair for each candidate storage scheme. */
    private static final double[] BYTES_PER_PAIR_SCHEMES = {1.3, 2.25, 6.25};
    private static final String[] SCHEME_NAMES = {
        "MPHF+val(1.3B)", "MPHF+fp+val(2.25B)", "open-addr+key(6.25B)"};

    private CountPerScriptBigrams() {
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the data directory; optional
     *             {@code args[1]} is the number of top pairs to list per script
     * @throws IOException if the directory or one of the files cannot be read
     * @throws NumberFormatException if {@code args[1]} is not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println(
                    "Usage: CountPerScriptBigrams <dataDir> [topK-per-script]");
            System.exit(1);
        }
        Path dataDir = Paths.get(args[0]);
        int topK = args.length >= 2 ? Integer.parseInt(args[1]) : 0;

        List<Path> trainFiles = new ArrayList<>();
        try (Stream<Path> s = Files.list(dataDir)) {
            s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
                    .sorted()
                    .forEach(trainFiles::add);
        }
        if (trainFiles.isEmpty()) {
            System.err.println("ERROR: no *.train.gz files in " + dataDir);
            System.exit(1);
        }

        System.out.printf("Found %d *.train.gz files in %s%n%n",
                trainFiles.size(), dataDir);
        System.out.printf(
                "%-22s %12s %12s %12s %12s | %s%n",
                "script", "total_N", "distinct_U", "singletons",
                "U(>=10)", "coverage: pairs needed for [50,75,90,95,99,99.9]%");
        System.out.println(repeat('-', 140));

        long grandTotalN = 0;
        long grandTotalU = 0;
        long grandTotalUge10 = 0;

        // script -> [totalN, distinctU, retained@cutoff0, retained@cutoff1, ...]
        Map<String, long[]> perScriptStats = new HashMap<>();

        for (Path trainFile : trainFiles) {
            String fname = trainFile.getFileName().toString();
            // Locale.ROOT keeps the label independent of the JVM default locale.
            String script = fname.substring(0, fname.length() - TRAIN_SUFFIX.length())
                    .toLowerCase(Locale.ROOT);

            Map<Long, long[]> pairCounts = countPairs(trainFile);
            long[] counts = countsDescending(pairCounts);
            int distinctU = counts.length;

            long totalN = 0;
            int singletons = 0;
            int uGe10 = 0;
            for (long c : counts) {
                totalN += c;
                if (c == 1) {
                    singletons++;
                }
                if (c >= 10) {
                    uGe10++;
                }
            }

            int[] coverage = coveragePairs(counts, totalN);
            StringBuilder cov = new StringBuilder();
            for (int i = 0; i < coverage.length; i++) {
                if (i > 0) {
                    cov.append(", ");
                }
                cov.append(String.format("%,d", coverage[i]));
            }

            System.out.printf("%-22s %,12d %,12d %,12d %,12d | %s%n",
                    script, totalN, distinctU, singletons, uGe10, cov);

            long[] sizeStats = new long[2 + MIN_COUNT_CUTOFFS.length];
            sizeStats[0] = totalN;
            sizeStats[1] = distinctU;
            for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
                sizeStats[2 + i] = retainedAt(counts, MIN_COUNT_CUTOFFS[i]);
            }
            perScriptStats.put(script, sizeStats);

            if (topK > 0) {
                printTopPairs(script, pairCounts, topK);
            }

            grandTotalN += totalN;
            grandTotalU += distinctU;
            grandTotalUge10 += uGe10;
        }

        System.out.println(repeat('-', 140));
        System.out.printf("%-22s %,12d %,12d %12s %,12d%n%n",
                "TOTAL", grandTotalN, grandTotalU,
                "-", grandTotalUge10);

        printSizeSummary(perScriptStats);
    }

    /**
     * Counts every adjacent codepoint pair in a gzip-compressed UTF-8 file.
     * Pairs never span a line break.
     *
     * @return packed pair (see {@link #packPair}) -> single-element counter
     */
    private static Map<Long, long[]> countPairs(Path trainFile) throws IOException {
        Map<Long, long[]> pairCounts = new HashMap<>(1 << 16);
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(
                        new GZIPInputStream(Files.newInputStream(trainFile)),
                        StandardCharsets.UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                int prevCp = -1;
                for (int i = 0; i < line.length(); ) {
                    int cp = line.codePointAt(i);
                    i += Character.charCount(cp);
                    if (prevCp >= 0) {
                        pairCounts.computeIfAbsent(packPair(prevCp, cp), k -> new long[1])[0]++;
                    }
                    prevCp = cp;
                }
            }
        }
        return pairCounts;
    }

    /** Returns all pair counts, largest first. */
    private static long[] countsDescending(Map<Long, long[]> pairCounts) {
        long[] counts = new long[pairCounts.size()];
        int idx = 0;
        for (long[] c : pairCounts.values()) {
            counts[idx++] = c[0];
        }
        Arrays.sort(counts);
        // Arrays.sort is ascending only for primitives; reverse in place.
        for (int i = 0, j = counts.length - 1; i < j; i++, j--) {
            long t = counts[i];
            counts[i] = counts[j];
            counts[j] = t;
        }
        return counts;
    }

    /**
     * For each coverage threshold t, finds the minimum k such that the k most
     * frequent pairs account for at least t of all occurrences. A threshold
     * that is never reached is reported as the full distinct count.
     */
    private static int[] coveragePairs(long[] countsDesc, long totalN) {
        int[] needed = new int[COVERAGE_THRESHOLDS.length];
        Arrays.fill(needed, countsDesc.length);
        long running = 0;
        int t = 0;
        for (int k = 0; k < countsDesc.length && t < needed.length; k++) {
            running += countsDesc[k];
            while (t < needed.length
                    && (double) running / totalN >= COVERAGE_THRESHOLDS[t]) {
                needed[t++] = k + 1;
            }
        }
        return needed;
    }

    /** Number of entries in the descending {@code countsDesc} that are at least {@code minCount}. */
    private static int retainedAt(long[] countsDesc, int minCount) {
        int kept = 0;
        while (kept < countsDesc.length && countsDesc[kept] >= minCount) {
            kept++;
        }
        return kept;
    }

    /** Lists the {@code topK} most frequent pairs of one script; ties are ordered by packed key. */
    private static void printTopPairs(String script, Map<Long, long[]> pairCounts, int topK) {
        System.out.printf(" top %d pairs in %s:%n", topK, script);
        List<Map.Entry<Long, long[]>> sorted = new ArrayList<>(pairCounts.entrySet());
        sorted.sort((a, b) -> {
            int byCount = Long.compare(b.getValue()[0], a.getValue()[0]);
            return byCount != 0 ? byCount : Long.compare(a.getKey(), b.getKey());
        });
        for (int i = 0; i < Math.min(topK, sorted.size()); i++) {
            Map.Entry<Long, long[]> e = sorted.get(i);
            long k = e.getKey();
            int cpA = (int) (k >>> 24);
            int cpB = (int) (k & 0xFFFFFFL);
            System.out.printf(" U+%04X U+%04X (%c %c) %,d%n",
                    cpA, cpB,
                    safePrint(cpA), safePrint(cpB),
                    e.getValue()[0]);
        }
    }

    /** Prints the cutoff-vs-size table and the per-script retained counts. */
    private static void printSizeSummary(Map<String, long[]> perScriptStats) {
        System.out.println("=== Model-size estimates by min-count cutoff and storage scheme ===");
        System.out.println("(sum of retained pairs across all scripts \u00d7 bytes-per-pair)");
        System.out.println();
        System.out.printf("%-12s", "cutoff");
        for (String name : SCHEME_NAMES) {
            System.out.printf(" %20s", name);
        }
        System.out.printf(" %20s%n", "retained_pairs");
        System.out.println(repeat('-', 12 + (SCHEME_NAMES.length + 1) * 21));

        for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
            long retained = 0;
            for (long[] stats : perScriptStats.values()) {
                retained += stats[2 + i];
            }
            System.out.printf("min_count>=%-2d", MIN_COUNT_CUTOFFS[i]);
            for (double bpp : BYTES_PER_PAIR_SCHEMES) {
                System.out.printf(" %18s ", humanBytes(retained * bpp));
            }
            System.out.printf(" %,20d%n", retained);
        }

        System.out.println();
        System.out.println("Per-script pair counts retained at each cutoff:");
        System.out.printf("%-22s", "script");
        for (int c : MIN_COUNT_CUTOFFS) {
            System.out.printf(" %12s", ">=" + c);
        }
        System.out.println();

        List<Map.Entry<String, long[]>> sortedScripts =
                new ArrayList<>(perScriptStats.entrySet());
        // Largest distinct count first; ties by name so the order is reproducible.
        Comparator<Map.Entry<String, long[]>> byDistinctDesc = (a, b) -> {
            int byDistinct = Long.compare(b.getValue()[1], a.getValue()[1]);
            return byDistinct != 0 ? byDistinct : a.getKey().compareTo(b.getKey());
        };
        sortedScripts.sort(byDistinctDesc);
        for (Map.Entry<String, long[]> e : sortedScripts) {
            System.out.printf("%-22s", e.getKey());
            for (int i = 0; i < MIN_COUNT_CUTOFFS.length; i++) {
                System.out.printf(" %,12d", e.getValue()[2 + i]);
            }
            System.out.println();
        }
    }

    /** Pack two codepoints (each up to 21 bits) into a single long. */
    private static long packPair(int cpA, int cpB) {
        return ((long) cpA << 24) | (cpB & 0xFFFFFFL);
    }

    /**
     * Returns a single {@code char} to display for {@code cp}: {@code '.'} for
     * C0 controls, DEL and undefined codepoints, {@code '?'} for codepoints
     * outside the BMP, otherwise the character itself.
     */
    private static char safePrint(int cp) {
        if (cp < 0x20 || cp == 0x7F || !Character.isDefined(cp)) {
            return '.';
        }
        if (Character.charCount(cp) != 1) {
            return '?';
        }
        return (char) cp;
    }

    /** Returns a string made of {@code n} copies of {@code c}. */
    private static String repeat(char c, int n) {
        char[] buf = new char[n];
        Arrays.fill(buf, c);
        return new String(buf);
    }

    /** Formats a byte count using binary (1024-based) B / KB / MB / GB units. */
    private static String humanBytes(double bytes) {
        if (bytes < 1024) {
            return String.format("%.0f B", bytes);
        }
        if (bytes < 1024 * 1024) {
            return String.format("%.1f KB", bytes / 1024.0);
        }
        if (bytes < 1024L * 1024 * 1024) {
            return String.format("%.2f MB", bytes / (1024.0 * 1024));
        }
        return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024));
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

/**
 * For each {@code *.train.gz} file in a directory, computes the per-line
 * "target-script fraction": the share of a line's codepoints that belong to
 * the script the file is named after. COMMON, INHERITED and UNKNOWN
 * codepoints are left out of both numerator and denominator.
 *
 * <p>Lines are binned into the buckets [0, 5, 10, 20, 30, 50, 70, 90, 100]%.
 * For each file one row is printed: the line count, the number of lines under
 * 5%, and the percentage of lines that a "drop lines below T%" rule would
 * remove for T in {10, 20, 30, 50, 70, 90, 100}.
 *
 * <p>A line with no script-bearing codepoints at all is scored 0% and is
 * therefore counted in the lowest bucket.
 *
 * <p>Each {@code {script}.train.gz} maps to the {@link Character.UnicodeScript}
 * constant whose name equals the upper-cased file base name; files without
 * such a constant get a "no UnicodeScript mapping" row instead.
 *
 * <p>Usage:
 * <pre>
 *   java LineScriptFractions &lt;dataDir&gt;
 * </pre>
 */
public final class LineScriptFractions {

    private static final String TRAIN_SUFFIX = ".train.gz";

    /** Lower bounds of the histogram buckets, in percent. */
    private static final int[] BUCKETS = {0, 5, 10, 20, 30, 50, 70, 90, 100};

    /** Keep-thresholds (percent) for which the cumulative drop share is reported. */
    private static final int[] DROP_THRESHOLDS = {10, 20, 30, 50, 70, 90, 100};

    private LineScriptFractions() {
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the directory holding the {@code *.train.gz} files
     * @throws IOException if the directory or one of the files cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: LineScriptFractions <dataDir>");
            System.exit(1);
        }
        Path dataDir = Paths.get(args[0]);
        Path[] files;
        try (Stream<Path> s = Files.list(dataDir)) {
            files = s.filter(p -> p.getFileName().toString().endsWith(TRAIN_SUFFIX))
                    .sorted().toArray(Path[]::new);
        }
        if (files.length == 0) {
            System.err.println("No *.train.gz files in " + dataDir);
            System.exit(1);
        }

        System.out.printf("%-20s %10s %10s | %s%n",
                "script", "lines", "<5%",
                "lines at target-frac threshold (cumulative dropped %)");
        // Column labels are generated with the same widths as the data row so
        // that they stay aligned with it and with DROP_THRESHOLDS.
        StringBuilder labels = new StringBuilder(
                String.format("%-20s %10s %10s |", "", "", ""));
        for (int t : DROP_THRESHOLDS) {
            labels.append(String.format(" %6s", "<" + t + "%"));
        }
        System.out.println(labels);
        System.out.println(repeat('-', 110));

        for (Path file : files) {
            String fname = file.getFileName().toString();
            // Locale.ROOT: a Turkish default locale would upper-case 'i' to
            // U+0130 and break the lookup of e.g. LATIN.
            String name = fname.substring(0, fname.length() - TRAIN_SUFFIX.length())
                    .toUpperCase(Locale.ROOT);
            Character.UnicodeScript target = mapScript(name);
            if (target == null) {
                System.out.printf("%-20s (no UnicodeScript mapping for '%s')%n", name, name);
                continue;
            }

            long[] bucketCounts = histogram(file, target);
            long lines = 0;
            for (long c : bucketCounts) {
                lines += c;   // every line lands in exactly one bucket
            }

            StringBuilder sb = new StringBuilder();
            for (int t : DROP_THRESHOLDS) {
                double pct = 100.0 * droppedBelow(bucketCounts, t) / Math.max(1, lines);
                sb.append(String.format(" %6.1f", pct));
            }

            long below5 = bucketCounts[0];
            System.out.printf("%-20s %,10d %,10d |%s%n",
                    name.toLowerCase(Locale.ROOT), lines, below5, sb);
        }
    }

    /**
     * Reads a gzip-compressed UTF-8 file and counts its lines per bucket.
     *
     * @return array parallel to {@link #BUCKETS}; element i counts the lines
     *         whose target-script percentage is at least {@code BUCKETS[i]}
     *         and below the next bucket bound (the last bucket holds 100%)
     */
    private static long[] histogram(Path file, Character.UnicodeScript target)
            throws IOException {
        long[] bucketCounts = new long[BUCKETS.length];
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(
                        new GZIPInputStream(Files.newInputStream(file)),
                        StandardCharsets.UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                bucketCounts[bucketIndex(targetPercent(line, target))]++;
            }
        }
        return bucketCounts;
    }

    /**
     * Percentage of script-bearing codepoints in {@code line} that are in
     * {@code target}; 0 when the line has no script-bearing codepoints.
     */
    private static double targetPercent(String line, Character.UnicodeScript target) {
        int total = 0;
        int matching = 0;
        for (int i = 0; i < line.length(); ) {
            int cp = line.codePointAt(i);
            i += Character.charCount(cp);
            Character.UnicodeScript s = Character.UnicodeScript.of(cp);
            if (s == Character.UnicodeScript.COMMON
                    || s == Character.UnicodeScript.INHERITED
                    || s == Character.UnicodeScript.UNKNOWN) {
                // Script-neutral (punctuation, spaces, combining marks, ...):
                // excluded from the denominator.
                continue;
            }
            total++;
            if (s == target) {
                matching++;
            }
        }
        return total == 0 ? 0.0 : 100.0 * matching / total;
    }

    /** Index of the last bucket whose lower bound does not exceed {@code pct}. */
    private static int bucketIndex(double pct) {
        int b = 0;
        while (b < BUCKETS.length - 1 && pct >= BUCKETS[b + 1]) {
            b++;
        }
        return b;
    }

    /**
     * Number of lines removed by "drop if percentage &lt; threshold": the sum
     * of all buckets whose exclusive upper bound is at most {@code threshold}.
     * The last bucket (exactly 100%) has no upper bound and is never dropped.
     */
    private static long droppedBelow(long[] bucketCounts, int threshold) {
        long dropped = 0;
        for (int j = 0; j < BUCKETS.length - 1; j++) {
            if (BUCKETS[j + 1] <= threshold) {
                dropped += bucketCounts[j];
            }
        }
        return dropped;
    }

    /** Returns the script constant named {@code name}, or {@code null} if none exists. */
    private static Character.UnicodeScript mapScript(String name) {
        try {
            return Character.UnicodeScript.valueOf(name);
        } catch (IllegalArgumentException notAScript) {
            return null;
        }
    }

    /** Returns a string made of {@code n} copies of {@code c}. */
    private static String repeat(char c, int n) {
        char[] b = new char[n];
        java.util.Arrays.fill(b, c);
        return new String(b);
    }
}
/**
 * Codepoint-level script census of one or more text files. For each input
 * file, reports the percentage of codepoints in each {@link
 * Character.UnicodeScript}, followed by a histogram of the dominant script of
 * each line.
 *
 * <p>Useful to verify whether {@code BuildJunkTrainingData} is bucketing
 * languages correctly: e.g. Japanese is usually a mix of HIRAGANA, KATAKANA
 * and HAN; if {@code jpn} ends up in {@code han.train.gz} we want to know
 * what fraction of its codepoints are actually Han ideographs vs. kana.
 *
 * <p>Usage:
 * <pre>
 *   java ScriptCensus &lt;file&gt; [file ...]    # supports .gz and plain text
 * </pre>
 */
public final class ScriptCensus {

    /** Max lines to sample per file (set high for full pass). */
    private static final int MAX_LINES = 200_000;

    /**
     * Orders entries by count, largest first; equal counts are ordered by
     * script name so the report does not depend on hash-map iteration order.
     */
    private static final Comparator<Map.Entry<String, long[]>> BY_COUNT_DESC_THEN_NAME =
            (a, b) -> {
                int byCount = Long.compare(b.getValue()[0], a.getValue()[0]);
                return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
            };

    private ScriptCensus() {}

    /**
     * Entry point: prints one report per regular file named in {@code args};
     * anything that is not a regular file is skipped with a note on stderr.
     *
     * @param args paths of the files to analyse
     * @throws IOException if a file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: ScriptCensus <file> [file ...]");
            System.exit(1);
        }
        for (String arg : args) {
            Path f = Paths.get(arg);
            if (!Files.isRegularFile(f)) {
                System.err.println("Skipping non-file: " + f);
                continue;
            }
            reportOne(f);
            System.out.println();
        }
    }

    /** Scans up to {@link #MAX_LINES} lines of {@code file} and prints both tables. */
    private static void reportOne(Path file) throws IOException {
        // Single-element long[] cells act as mutable counters (no boxing per codepoint).
        Map<String, long[]> scriptCounts = new HashMap<>();
        // Per-line dominant-script histogram.
        Map<String, long[]> dominantHistogram = new HashMap<>();
        long total = 0;
        long lines = 0;

        try (BufferedReader r = open(file)) {
            String line;
            // Test the cap first so no line is read and then thrown away.
            while (lines < MAX_LINES && (line = r.readLine()) != null) {
                lines++;
                // For MADLAD/Wikipedia files the format is "lineNum TAB text";
                // strip the prefix if present.
                int tab = line.indexOf('\t');
                String text = tab >= 0 ? line.substring(tab + 1) : line;

                Map<String, Long> perLine = new HashMap<>();
                for (int i = 0; i < text.length(); ) {
                    int cp = text.codePointAt(i);
                    i += Character.charCount(cp);
                    Character.UnicodeScript s = Character.UnicodeScript.of(cp);
                    if (isScriptNeutral(s)) {
                        continue;
                    }
                    String name = s.name();
                    scriptCounts.computeIfAbsent(name, k -> new long[1])[0]++;
                    perLine.merge(name, 1L, Long::sum);
                    total++;
                }
                String dom = dominantScript(perLine);
                if (dom != null) {
                    dominantHistogram.computeIfAbsent(dom, k -> new long[1])[0]++;
                }
            }
        }

        System.out.printf("File: %s%n", file);
        // The reported total also leaves out UNKNOWN codepoints.
        System.out.printf(" lines sampled: %,d total codepoints (excl. COMMON/INHERITED): %,d%n%n",
                lines, total);

        if (total == 0) {
            System.out.println(" (empty / no scripted codepoints)");
            return;
        }

        System.out.println(" Codepoint distribution by script:");
        List<Map.Entry<String, long[]>> sorted = new ArrayList<>(scriptCounts.entrySet());
        sorted.sort(BY_COUNT_DESC_THEN_NAME);
        long cumulative = 0;
        for (Map.Entry<String, long[]> e : sorted) {
            long c = e.getValue()[0];
            cumulative += c;
            double pct = 100.0 * c / total;
            double cumPct = 100.0 * cumulative / total;
            // Hide negligible scripts, but keep them in the running total.
            if (pct < 0.01 && c < 100) {
                continue;
            }
            System.out.printf(" %-22s %,14d %6.2f%% (cum %6.2f%%)%n",
                    e.getKey(), c, pct, cumPct);
        }

        System.out.println();
        System.out.println(" Per-line dominant-script histogram:");
        List<Map.Entry<String, long[]>> dom = new ArrayList<>(dominantHistogram.entrySet());
        dom.sort(BY_COUNT_DESC_THEN_NAME);
        long domTotal = 0;
        for (long[] v : dominantHistogram.values()) {
            domTotal += v[0];
        }
        for (Map.Entry<String, long[]> e : dom) {
            long c = e.getValue()[0];
            double pct = 100.0 * c / domTotal;
            if (pct < 0.05) {
                continue;
            }
            System.out.printf(" %-22s %,12d %6.2f%% of lines%n",
                    e.getKey(), c, pct);
        }
    }

    /** Returns true for the scripts that are left out of every count. */
    private static boolean isScriptNeutral(Character.UnicodeScript s) {
        return s == Character.UnicodeScript.COMMON
                || s == Character.UnicodeScript.INHERITED
                || s == Character.UnicodeScript.UNKNOWN;
    }

    /**
     * Returns the script with the highest count in {@code perLine}, or null if
     * the map is empty. Ties go to the alphabetically first script name.
     */
    private static String dominantScript(Map<String, Long> perLine) {
        String dom = null;
        long best = -1;
        for (Map.Entry<String, Long> e : perLine.entrySet()) {
            long count = e.getValue();
            // dom is non-null whenever count == best, because best starts at -1
            // and every stored count is at least 1.
            if (count > best || (count == best && e.getKey().compareTo(dom) < 0)) {
                best = count;
                dom = e.getKey();
            }
        }
        return dom;
    }

    /**
     * Opens {@code path} as UTF-8 text, transparently gunzipping files whose
     * name ends in {@code .gz}.
     */
    private static BufferedReader open(Path path) throws IOException {
        if (!path.getFileName().toString().endsWith(".gz")) {
            return Files.newBufferedReader(path, StandardCharsets.UTF_8);
        }
        var raw = Files.newInputStream(path);
        try {
            return new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(raw), StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // The GZIPInputStream constructor reads the header and can fail;
            // close the file handle ourselves since no wrapper owns it yet.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}
