This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch junk-detector-v6 in repository https://gitbox.apache.org/repos/asf/tika.git
commit a24d53259ebbf0cfc9f7ceaf365fc78a7348bbe4 Author: tballison <[email protected]> AuthorDate: Thu May 14 13:24:47 2026 -0400 checkpoint --- .../ml/junkdetect/tools/BuildJunkTrainingData.java | 176 ++++++++++++++++++--- 1 file changed, 157 insertions(+), 19 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java index 27a5436d5e..26ba0b9732 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java @@ -102,12 +102,33 @@ public class BuildJunkTrainingData { */ private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L; + /** + * Maximum UTF-8 bytes any single language may contribute to its script + * bucket. Prevents one language (e.g. {@code zho} with 8 GB of MADLAD) + * from dominating a multi-language script. Languages with less than this + * available take what they have; languages above the cap get truncated. + * Default {@code 5 MB} balances diversity against per-language coverage. + */ + private static final long DEFAULT_PER_LANGUAGE_CAP_BYTES = 5_000_000L; + /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ private static final int DEFAULT_MIN_BYTES = 50; /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */ private static final double DEFAULT_MAX_PUNC_FRAC = 0.30; + /** + * Minimum fraction of a sentence's non-COMMON/INHERITED codepoints that + * must belong to the script bucket's target script for the sentence to be + * accepted. Lines whose target-script fraction falls below this floor are + * dropped — typically these are off-target Wikipedia stubs (e.g. an article + * about Gothic written almost entirely in English). Set very low by + * default so that legitimate mixed-script content (Japanese with kanji + + * kana, Korean with hanja annotations, Chinese with English citations) is + * preserved. + */ + private static final double DEFAULT_MIN_TARGET_SCRIPT_FRAC = 0.05; + /** Fraction of sentences written to each split (train / dev / test = 80/10/10). */ private static final double TRAIN_FRAC = 0.80; private static final double DEV_FRAC = 0.10; @@ -131,11 +152,14 @@ public class BuildJunkTrainingData { Path outputDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect"); int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES; long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES; + long perLanguageCapBytes = DEFAULT_PER_LANGUAGE_CAP_BYTES; int minBytes = DEFAULT_MIN_BYTES; double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC; + double minTargetScriptFrac = DEFAULT_MIN_TARGET_SCRIPT_FRAC; int seed = 42; boolean dryRun = false; int minDevSentences = DEFAULT_MIN_DEV_SENTENCES; + java.util.Set<String> dropScripts = new java.util.HashSet<>(); for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -151,18 +175,30 @@ public class BuildJunkTrainingData { case "--total-budget-bytes": totalBudgetBytes = Long.parseLong(args[++i]); break; + case "--per-language-cap-bytes": + perLanguageCapBytes = Long.parseLong(args[++i]); + break; case "--min-bytes": minBytes = Integer.parseInt(args[++i]); break; case "--max-punc-frac": maxPuncFrac = Double.parseDouble(args[++i]); break; + case "--min-target-script-frac": + minTargetScriptFrac = Double.parseDouble(args[++i]); + break; case "--seed": seed = Integer.parseInt(args[++i]); break; case "--min-dev-sentences": minDevSentences = Integer.parseInt(args[++i]); break; + case "--drop-scripts": + for (String s : args[++i].split(",")) { + String t = s.trim().toUpperCase(); + if (!t.isEmpty()) dropScripts.add(t); + } + break; case "--dry-run": dryRun = true; break; @@ -176,13 +212,19 @@ public class BuildJunkTrainingData { System.out.println("=== BuildJunkTrainingData ==="); System.out.println(" data-dir: " + dataDir); System.out.println(" output-dir: " + outputDir); - System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", + System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", totalBudgetBytes, totalBudgetBytes / 1_000_000.0); - System.out.printf( " min-bytes: %d%n", minBytes); - System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); - System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", + System.out.printf( " per-language-cap: %,d (%.1f MB)%n", + perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0); + System.out.printf( " min-bytes: %d%n", minBytes); + System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); + System.out.printf( " min-target-script-frac: %.2f%n", minTargetScriptFrac); + System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", minDevSentences, (int)(minDevSentences / DEV_FRAC)); - System.out.println(" dry-run: " + dryRun); + if (!dropScripts.isEmpty()) { + System.out.println(" drop-scripts: " + dropScripts); + } + System.out.println(" dry-run: " + dryRun); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); @@ -208,6 +250,15 @@ public class BuildJunkTrainingData { System.out.printf(" %-12s → %s%n", lang, script); } } + + if (!dropScripts.isEmpty()) { + for (String s : dropScripts) { + if (scriptGroups.remove(s) != null) { + System.out.printf(" DROP script: %s%n", s); + } + } + } + System.out.printf("%n → %d languages, %d script groups%n", langToScript.size(), scriptGroups.size()); @@ -273,8 +324,16 @@ public class BuildJunkTrainingData { String script = budgetEntry.getKey(); long budget = budgetEntry.getValue(); List<Path> langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(budget / langDirs.size(), 1L); + // Apply per-language cap on top of the even split, but only for + // multi-language buckets. For single-language scripts (e.g. KHMER, + // HANGUL), the cap would needlessly limit a bucket that has only + // one source; let it consume its full budget instead. + long capPerLang = langDirs.size() > 1 + ? Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List<String> sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -282,8 +341,10 @@ public class BuildJunkTrainingData { long remaining = budget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; if (langBytes > 0) { System.out.printf(" %-12s %-20s +%,d bytes%n", @@ -327,7 +388,11 @@ public class BuildJunkTrainingData { long newBudget = budget + extra; List<Path> langDirs = scriptGroups.get(script); + Character.UnicodeScript targetScript = parseUnicodeScript(script); long perLangBytes = Math.max(newBudget / langDirs.size(), 1L); + long capPerLang = langDirs.size() > 1 + ? Math.min(perLangBytes, perLanguageCapBytes) + : perLangBytes; List<String> sentences = new ArrayList<>(); long totalBytesLoaded = 0; @@ -335,8 +400,10 @@ public class BuildJunkTrainingData { long remaining = newBudget - totalBytesLoaded; if (remaining <= 0) break; long langBytes = loadSentences(langDir, - Math.min(perLangBytes, remaining), - minBytes, maxPuncFrac, sentences); + Math.min(capPerLang, remaining), + minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac, + sentences); totalBytesLoaded += langBytes; } if (!sentences.isEmpty()) { @@ -415,6 +482,21 @@ public class BuildJunkTrainingData { System.out.println("Done."); } + /** + * Parses a script-bucket name (e.g. {@code "HAN"}) into a + * {@link Character.UnicodeScript}, or returns {@code null} if the name + * does not correspond to a real script (e.g. {@code "COMMON"} or any + * future synthetic bucket). Used by the corpus builder to look up the + * target script for the {@code min-target-script-frac} filter. + */ + static Character.UnicodeScript parseUnicodeScript(String name) { + try { + return Character.UnicodeScript.valueOf(name); + } catch (IllegalArgumentException e) { + return null; + } + } + // ----------------------------------------------------------------------- // Script detection // ----------------------------------------------------------------------- @@ -531,6 +613,22 @@ public class BuildJunkTrainingData { */ static long loadSentences(Path langDir, long maxBytes, int minBytes, double maxPuncFrac, List<String> result) { + // Backwards-compatible overload: no target-script filter. + return loadSentences(langDir, maxBytes, minBytes, maxPuncFrac, + null, 0.0, result); + } + + /** + * Same as the 5-arg overload, but additionally drops sentences whose + * fraction of {@code targetScript} codepoints (relative to all non- + * COMMON/INHERITED codepoints) is below {@code minTargetScriptFrac}. + * Passing {@code targetScript == null} disables the target-script filter. + */ + static long loadSentences(Path langDir, long maxBytes, int minBytes, + double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac, + List<String> result) { long bytesLoaded = 0; for (String filename : new String[]{"sentences_wikipedia.txt", "sentences_madlad.txt"}) { if (bytesLoaded >= maxBytes) { @@ -553,7 +651,8 @@ public class BuildJunkTrainingData { if (text.isEmpty()) { continue; } - String filtered = filterSentence(text, minBytes, maxPuncFrac); + String filtered = filterSentence(text, minBytes, maxPuncFrac, + targetScript, minTargetScriptFrac); if (filtered != null) { int sentBytes = filtered.getBytes(StandardCharsets.UTF_8).length; result.add(filtered); @@ -577,6 +676,18 @@ public class BuildJunkTrainingData { * @return the normalised sentence, or {@code null} if it should be discarded */ static String filterSentence(String text, int minBytes, double maxPuncFrac) { + return filterSentence(text, minBytes, maxPuncFrac, null, 0.0); + } + + /** + * Same as the 3-arg overload, but additionally rejects sentences whose + * fraction of {@code targetScript} codepoints (over non-COMMON/INHERITED + * codepoints) is below {@code minTargetScriptFrac}. If {@code + * targetScript == null} the target-script filter is skipped. + */ + static String filterSentence(String text, int minBytes, double maxPuncFrac, + Character.UnicodeScript targetScript, + double minTargetScriptFrac) { if (text.indexOf('\uFFFD') >= 0) { return null; } @@ -586,17 +697,34 @@ public class BuildJunkTrainingData { } int cpCount = 0; int puncCount = 0; + int scriptCpTotal = 0; + int scriptCpMatching = 0; for (int i = 0; i < text.length(); ) { int cp = text.codePointAt(i); cpCount++; if (cp >= 0x21 && cp <= 0x7E && !Character.isLetter(cp)) { puncCount++; } + if (targetScript != null) { + Character.UnicodeScript s = Character.UnicodeScript.of(cp); + if (s != Character.UnicodeScript.COMMON + && s != Character.UnicodeScript.INHERITED + && s != Character.UnicodeScript.UNKNOWN) { + scriptCpTotal++; + if (s == targetScript) { + scriptCpMatching++; + } + } + } i += Character.charCount(cp); } if (cpCount > 0 && (double) puncCount / cpCount > maxPuncFrac) { return null; } + if (targetScript != null && scriptCpTotal > 0 + && (double) scriptCpMatching / scriptCpTotal < minTargetScriptFrac) { + return null; + } return text; } @@ -624,23 +752,33 @@ public class BuildJunkTrainingData { private static void printUsage() { System.err.println("Usage: BuildJunkTrainingData [options]"); - System.err.println(" --data-dir <path> MADLAD data root" + System.err.println(" --data-dir <path> MADLAD data root" + " (default: ~/datasets/madlad/data)"); - System.err.println(" --output-dir <path> Output directory" + System.err.println(" --output-dir <path> Output directory" + " (default: ~/datasets/madlad/junkdetect)"); - System.err.println(" --script-sample-lines N Lines per language for script" + System.err.println(" --script-sample-lines N Lines per language for script" + " detection (default: 2000)"); - System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all" + System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all" + " scripts (default: 50000000)"); - System.err.println(" --min-bytes N Min UTF-8 bytes per sentence" + System.err.println(" --per-language-cap-bytes N Max UTF-8 bytes contributed by any" + + " single language to its script bucket (default: 5000000). Prevents one large" + + " language source from dominating a multi-language bucket."); + System.err.println(" --min-bytes N Min UTF-8 bytes per sentence" + " (default: 50)"); - System.err.println(" --max-punc-frac F Max ASCII punct fraction" + System.err.println(" --max-punc-frac F Max ASCII punct fraction" + " (default: 0.30)"); - System.err.println(" --min-dev-sentences N Min sentences in dev split for a" + System.err.println(" --min-target-script-frac F Min fraction of non-COMMON cps that" + + " must be in the bucket's target script for a sentence to be kept" + + " (default: 0.05). Filters off-target Wikipedia stubs (e.g. English-about-Gothic" + + " articles in the GOTHIC bucket)."); + System.err.println(" --min-dev-sentences N Min sentences in dev split for a" + " script to be included (default: 500). Scripts below this floor" + " have unreliable calibration and inflated FPR."); - System.err.println(" --seed N Random seed (default: 42)"); - System.err.println(" --dry-run Detect scripts + show budget," + System.err.println(" --drop-scripts S,S,.. Comma-separated script bucket names" + + " to exclude (e.g. GOTHIC,THAANA). Use when source data is too thin or off-" + + " target for reliable distribution estimates."); + System.err.println(" --seed N Random seed (default: 42)"); + System.err.println(" --dry-run Detect scripts + show budget," + " skip file writing"); } }
