This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch junk-detector-v6 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8c91c28a5885f445201a97de6d075a1fc28b3e0d Author: tballison <[email protected]> AuthorDate: Thu May 14 13:42:03 2026 -0400 junk-detector: move training choices into JunkDetectorTrainingConfig Replaces per-tool CLI flags for durable training/build parameters with a single committed config class. CLI surface of the two tools shrinks to data-dir, output(-dir), and (for BuildJunkTrainingData) --dry-run. Any attempt to pass a now-removed flag like --total-budget-bytes or --min-bigram-count is rejected with a pointer to the config file. Rationale: we've repeatedly burned cycles asking "wait, which run was that?" when a model file's identity depended on shell history rather than tracked source. With this change every parameter that affects the model lives in code that's reviewed and grep-able from a commit hash. The config values pin the current shipping setup: 500 MB total budget with a 5 MB per-language cap, 5% target-script-fraction line filter, GOTHIC and THAANA dropped, min_bigram_count = 3, 16 Mbit Bloom. These together produce macro Cohen's d = 12.11 / FPR = 0.004 / TPR = 0.894 on the dev split (vs. honest v6 baseline of 9.81 / 0.017 / 0.865). The smoke-rerun produced a model file whose MD5 matches the prior CLI-flag-driven v2 model byte-for-byte; the refactor is provably behavior-preserving. Format-tied constants (V6_BIGRAM_BUCKETS, V6_FNV_SEED, etc.) stay in TrainJunkModel — they're part of the v6 binary protocol, not tunable training choices, and moving them would muddy the distinction. Test JunkDetectorTrainingConfigTest pins the current values so any future change has to land alongside an explicit assertion update. 29 tests pass (24 previous + 5 new). 
Co-authored-by: Cursor <[email protected]> --- .../ml/junkdetect/tools/BuildJunkTrainingData.java | 195 ++++++++------------- .../tools/JunkDetectorTrainingConfig.java | 179 +++++++++++++++++++ .../tika/ml/junkdetect/tools/TrainJunkModel.java | 55 +++--- .../tools/JunkDetectorTrainingConfigTest.java | 101 +++++++++++ 4 files changed, 381 insertions(+), 149 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java index 26ba0b9732..a80fafbd6b 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java @@ -82,66 +82,18 @@ import java.util.zip.GZIPOutputStream; public class BuildJunkTrainingData { // ----------------------------------------------------------------------- - // Defaults + // Split ratios — fixed, part of the model identity (changing them would + // invalidate downstream eval comparisons). // ----------------------------------------------------------------------- - /** Lines read per language to determine dominant script. */ - private static final int DEFAULT_SCRIPT_SAMPLE_LINES = 2_000; - - /** - * UTF-8 bytes loaded per script group for entropy estimation. - * Budget is spread evenly across languages in the group. - * 200KB is enough to observe the bigram distribution reliably. - */ - private static final long ENTROPY_SAMPLE_BYTES = 200_000L; - - /** - * Total UTF-8 byte budget across all script groups. Divided proportionally - * by bigram entropy after the sampling phase. 50MB gives ~1–3MB per script - * on average across 34 groups; scale up for production runs. 
- */ - private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L; - - /** - * Maximum UTF-8 bytes any single language may contribute to its script - * bucket. Prevents one language (e.g. {@code zho} with 8 GB of MADLAD) - * from dominating a multi-language script. Languages with less than this - * available take what they have; languages above the cap get truncated. - * Default {@code 5 MB} balances diversity against per-language coverage. - */ - private static final long DEFAULT_PER_LANGUAGE_CAP_BYTES = 5_000_000L; - - /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ - private static final int DEFAULT_MIN_BYTES = 50; - - /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */ - private static final double DEFAULT_MAX_PUNC_FRAC = 0.30; - - /** - * Minimum fraction of a sentence's non-COMMON/INHERITED codepoints that - * must belong to the script bucket's target script for the sentence to be - * accepted. Lines whose target-script fraction falls below this floor are - * dropped — typically these are off-target Wikipedia stubs (e.g. an article - * about Gothic written almost entirely in English). Set very low by - * default so that legitimate mixed-script content (Japanese with kanji + - * kana, Korean with hanja annotations, Chinese with English citations) is - * preserved. - */ - private static final double DEFAULT_MIN_TARGET_SCRIPT_FRAC = 0.05; - /** Fraction of sentences written to each split (train / dev / test = 80/10/10). */ private static final double TRAIN_FRAC = 0.80; private static final double DEV_FRAC = 0.10; // remaining (1 - TRAIN_FRAC - DEV_FRAC) goes to the test split - /** - * Minimum number of sentences that must land in the dev split for a script to be - * included in the model. Scripts below this floor have too few samples to reliably - * estimate calibration statistics (mu/sigma), which produces noisy z-scores and - * inflated false positive rates. 
With DEV_FRAC=0.10 the effective minimum total - * sentence count is minDevSentences / DEV_FRAC (default: 5,000 total sentences). - */ - private static final int DEFAULT_MIN_DEV_SENTENCES = 500; + // All other durable parameters live in JunkDetectorTrainingConfig. This + // tool deliberately does not accept CLI overrides for those values; see + // the rejection logic in main() below. // ----------------------------------------------------------------------- // Entry point @@ -150,16 +102,22 @@ public class BuildJunkTrainingData { public static void main(String[] args) throws IOException { Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "data"); Path outputDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect"); - int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES; - long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES; - long perLanguageCapBytes = DEFAULT_PER_LANGUAGE_CAP_BYTES; - int minBytes = DEFAULT_MIN_BYTES; - double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC; - double minTargetScriptFrac = DEFAULT_MIN_TARGET_SCRIPT_FRAC; - int seed = 42; boolean dryRun = false; - int minDevSentences = DEFAULT_MIN_DEV_SENTENCES; - java.util.Set<String> dropScripts = new java.util.HashSet<>(); + + // Bind config-controlled values into local variables. These are + // read-only from this point on; any attempt to override them via CLI + // is rejected below. 
+ long totalBudgetBytes = JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES; + long perLanguageCapBytes = JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES; + int minBytes = JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE; + double maxPuncFrac = JunkDetectorTrainingConfig.MAX_PUNC_FRAC; + double minTargetScriptFrac = JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC; + int minDevSentences = JunkDetectorTrainingConfig.MIN_DEV_SENTENCES; + int scriptSampleLines = JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES; + int seed = JunkDetectorTrainingConfig.SEED; + java.util.Set<String> dropScripts = JunkDetectorTrainingConfig.DROP_SCRIPTS; + Map<String, Long> scriptBudgetOverrides = + JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES; for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -169,38 +127,25 @@ public class BuildJunkTrainingData { case "--output-dir": outputDir = Paths.get(args[++i]); break; - case "--script-sample-lines": - scriptSampleLines = Integer.parseInt(args[++i]); + case "--dry-run": + dryRun = true; break; + // Durable parameters are config-controlled. Refuse any CLI + // override so that a model file's identity always matches the + // committed config. 
+ case "--script-sample-lines": case "--total-budget-bytes": - totalBudgetBytes = Long.parseLong(args[++i]); - break; case "--per-language-cap-bytes": - perLanguageCapBytes = Long.parseLong(args[++i]); - break; case "--min-bytes": - minBytes = Integer.parseInt(args[++i]); - break; case "--max-punc-frac": - maxPuncFrac = Double.parseDouble(args[++i]); - break; case "--min-target-script-frac": - minTargetScriptFrac = Double.parseDouble(args[++i]); - break; case "--seed": - seed = Integer.parseInt(args[++i]); - break; case "--min-dev-sentences": - minDevSentences = Integer.parseInt(args[++i]); - break; case "--drop-scripts": - for (String s : args[++i].split(",")) { - String t = s.trim().toUpperCase(); - if (!t.isEmpty()) dropScripts.add(t); - } - break; - case "--dry-run": - dryRun = true; + case "--script-budget-override": + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." + + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); break; default: System.err.println("Unknown argument: " + args[i]); @@ -210,21 +155,26 @@ public class BuildJunkTrainingData { } System.out.println("=== BuildJunkTrainingData ==="); - System.out.println(" data-dir: " + dataDir); - System.out.println(" output-dir: " + outputDir); - System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", + System.out.println(" data-dir: " + dataDir); + System.out.println(" output-dir: " + outputDir); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " total-budget-bytes: %,d (%.1f MB)%n", totalBudgetBytes, totalBudgetBytes / 1_000_000.0); - System.out.printf( " per-language-cap: %,d (%.1f MB)%n", + System.out.printf( " per-language-cap: %,d (%.1f MB)%n", perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0); - System.out.printf( " min-bytes: %d%n", minBytes); - System.out.printf( " max-punc-frac: %.2f%n", maxPuncFrac); + System.out.printf( " min-bytes: %d%n", minBytes); + System.out.printf( " max-punc-frac: 
%.2f%n", maxPuncFrac); System.out.printf( " min-target-script-frac: %.2f%n", minTargetScriptFrac); - System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", + System.out.printf( " min-dev-sentences: %d (min total ≈ %d)%n", minDevSentences, (int)(minDevSentences / DEV_FRAC)); + System.out.printf( " seed: %d%n", seed); if (!dropScripts.isEmpty()) { - System.out.println(" drop-scripts: " + dropScripts); + System.out.println(" drop-scripts: " + dropScripts); } - System.out.println(" dry-run: " + dryRun); + if (!scriptBudgetOverrides.isEmpty()) { + System.out.println(" script-budget-override: " + scriptBudgetOverrides); + } + System.out.println(" dry-run: " + dryRun); if (!Files.isDirectory(dataDir)) { System.err.println("ERROR: data-dir not found: " + dataDir); @@ -273,7 +223,8 @@ public class BuildJunkTrainingData { String script = entry.getKey(); List<Path> langDirs = entry.getValue(); - long perLangSampleBytes = Math.max(ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); + long perLangSampleBytes = Math.max( + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES / langDirs.size(), 2_000L); List<String> sample = new ArrayList<>(); for (Path langDir : langDirs) { loadSentences(langDir, perLangSampleBytes, minBytes, maxPuncFrac, sample); @@ -297,9 +248,25 @@ public class BuildJunkTrainingData { Map<String, Long> scriptBudget = new TreeMap<>(); for (Map.Entry<String, Double> e : scriptEntropy.entrySet()) { long budget = (long) (totalBudgetBytes * e.getValue() / totalEntropy); + Long override = scriptBudgetOverrides.get(e.getKey()); + if (override != null) { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)" + + " [OVERRIDE: was %,d (%.1f MB)]%n", + e.getKey(), e.getValue(), override, override / 1_000_000.0, + budget, budget / 1_000_000.0); + budget = override; + } else { + System.out.printf(" %-20s H=%.3f → %,d bytes (%.1f MB)%n", + e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } scriptBudget.put(e.getKey(), budget); - System.out.printf(" %-20s 
H=%.3f → %,d bytes (%.1f MB)%n", - e.getKey(), e.getValue(), budget, budget / 1_000_000.0); + } + // Warn about overrides for scripts that aren't in the bucket set. + for (String k : scriptBudgetOverrides.keySet()) { + if (!scriptBudget.containsKey(k)) { + System.err.printf("WARNING: --script-budget-override for %s ignored" + + " (script not in bucket set)%n", k); + } } if (dryRun) { @@ -752,33 +719,15 @@ public class BuildJunkTrainingData { private static void printUsage() { System.err.println("Usage: BuildJunkTrainingData [options]"); - System.err.println(" --data-dir <path> MADLAD data root" + System.err.println(" --data-dir <path> MADLAD data root" + " (default: ~/datasets/madlad/data)"); - System.err.println(" --output-dir <path> Output directory" + System.err.println(" --output-dir <path> Output directory" + " (default: ~/datasets/madlad/junkdetect)"); - System.err.println(" --script-sample-lines N Lines per language for script" - + " detection (default: 2000)"); - System.err.println(" --total-budget-bytes N Total UTF-8 bytes across all" - + " scripts (default: 50000000)"); - System.err.println(" --per-language-cap-bytes N Max UTF-8 bytes contributed by any" - + " single language to its script bucket (default: 5000000). Prevents one large" - + " language source from dominating a multi-language bucket."); - System.err.println(" --min-bytes N Min UTF-8 bytes per sentence" - + " (default: 50)"); - System.err.println(" --max-punc-frac F Max ASCII punct fraction" - + " (default: 0.30)"); - System.err.println(" --min-target-script-frac F Min fraction of non-COMMON cps that" - + " must be in the bucket's target script for a sentence to be kept" - + " (default: 0.05). Filters off-target Wikipedia stubs (e.g. English-about-Gothic" - + " articles in the GOTHIC bucket)."); - System.err.println(" --min-dev-sentences N Min sentences in dev split for a" - + " script to be included (default: 500). 
Scripts below this floor" - + " have unreliable calibration and inflated FPR."); - System.err.println(" --drop-scripts S,S,.. Comma-separated script bucket names" - + " to exclude (e.g. GOTHIC,THAANA). Use when source data is too thin or off-" - + " target for reliable distribution estimates."); - System.err.println(" --seed N Random seed (default: 42)"); - System.err.println(" --dry-run Detect scripts + show budget," - + " skip file writing"); + System.err.println(" --dry-run Detect scripts + show budget," + + " skip file writing."); + System.err.println(); + System.err.println("All other training/build parameters (budgets, filters, dropped" + + " scripts, seed, etc.) are fixed in JunkDetectorTrainingConfig and tracked" + + " in git. Edit that file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java new file mode 100644 index 0000000000..15600a96c6 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect.tools; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; + +/** + * Frozen set of training-time choices that together define a junk-detector + * model's identity. Any change to these values produces a meaningfully + * different model and must be reviewed in git. + * + * <p>Two principles drove making this a class rather than CLI flags: + * + * <ol> + * <li><b>Reproducibility.</b> When we look back at a model file six + * months later we want a single commit hash that says exactly what + * knobs produced it, not a half-remembered shell history. + * <li><b>Drift prevention.</b> CLI flags with defaults allow accidental + * deviation between developers ("did you remember to pass + * {@code --min-target-script-frac 0.05}?"). Constants in a tracked + * file remove that failure mode. + * </ol> + * + * <p>{@link BuildJunkTrainingData} and {@link TrainJunkModel} read the + * values here; both tools <b>refuse to start</b> if any CLI argument + * attempts to override a config-controlled parameter, surfacing the + * mistake at launch time rather than silently producing a non-canonical + * model. + * + * <p>The constants below reflect the choices that produced the current + * shipping model and are recorded in the corresponding training notes + * ({@code 20260514-junk-retrain-v6.md}). Update them by editing this + * file and committing the change together with the new model output. + * + * <p>The class has no instance state; all values are exposed as + * {@code public static final}. This keeps callsites short and avoids + * the temptation of passing a runtime-mutable config around. + * + * <p>This is not part of the public model-loading API. 
The {@link + * org.apache.tika.ml.junkdetect.JunkDetector} runtime is configuration- + * free; once a model file is built, all of its baked-in choices travel + * with the file's binary format. + */ +public final class JunkDetectorTrainingConfig { + + // ======================================================================= + // Corpus build (BuildJunkTrainingData) + // ======================================================================= + + /** + * Total UTF-8 byte budget across all script groups. Divided + * proportionally by per-script bigram entropy after the sampling phase. + */ + public static final long TOTAL_BUDGET_BYTES = 500_000_000L; + + /** + * Maximum UTF-8 bytes a single language may contribute to a + * multi-language script bucket. Prevents one large source (e.g. {@code + * zho} with 8 GB of MADLAD) from dominating a multi-language script + * model. Buckets with only one language ignore this cap and may consume + * their full budget. See {@link BuildJunkTrainingData} Phase 4. + */ + public static final long PER_LANGUAGE_CAP_BYTES = 5_000_000L; + + /** + * Sentence-level filter: minimum fraction of non-COMMON/INHERITED + * codepoints that must belong to the script bucket's target script for a + * sentence to be accepted. Set low so legitimate mixed-script content + * (Japanese kanji + kana, Korean with hanja annotations, Chinese with + * English citations, etc.) is preserved, but enough to reject lines that + * are essentially off-target (e.g. an English article about Gothic in + * the GOTHIC bucket). + */ + public static final double MIN_TARGET_SCRIPT_FRAC = 0.05; + + /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */ + public static final int MIN_BYTES_PER_SENTENCE = 50; + + /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */ + public static final double MAX_PUNC_FRAC = 0.30; + + /** + * Minimum number of sentences that must land in the dev split for a + * script to be included in the model. 
Scripts below this floor have + * insufficient data to reliably estimate calibration statistics, which + * inflates FPR. With {@code DEV_FRAC = 0.10} this corresponds to a + * total-sentence floor of {@code 500 / 0.10 = 5000} per script. + */ + public static final int MIN_DEV_SENTENCES = 500; + + /** Lines read per language to determine the language's dominant script. */ + public static final int SCRIPT_SAMPLE_LINES = 2_000; + + /** + * UTF-8 bytes loaded per script group for bigram entropy estimation, + * driving the entropy-proportional budget allocation. 200 KB is + * sufficient to characterise the bigram distribution of any single + * script. + */ + public static final long ENTROPY_SAMPLE_BYTES = 200_000L; + + /** Random seed for sentence shuffling and other corpus-build randomness. */ + public static final int SEED = 42; + + /** + * Script bucket names whose source data is too thin or too off-target + * to produce reliable per-script F1 calibration. Excluded from the + * model entirely; the {@link + * org.apache.tika.ml.junkdetect.JunkDetector#score(String)} routing + * falls back to "unknown script" behavior for these scripts. + * + * <p>The current selection is based on a corpus audit that found these + * scripts either had thin native source data (e.g. THAANA: 216 train + * sentences from Maldivian), or had sources dominated by off-target + * content (e.g. GOTHIC: 40% of lines are {@literal <}5% Gothic — the + * Wikipedia "gothic" directory is English text about Gothic). + * + * <p>Three further scripts (CANADIAN_ABORIGINAL, CHEROKEE, TIFINAGH) + * are not listed here because the {@link #MIN_TARGET_SCRIPT_FRAC} + * filter implicitly removes them — their MADLAD sources contain + * effectively no native-script content at the 5% threshold. Listing + * them here is unnecessary and would obscure the data-quality finding. 
+ */ + public static final Set<String> DROP_SCRIPTS = + Collections.unmodifiableSet(new java.util.TreeSet<>(Set.of("GOTHIC", "THAANA"))); + + /** + * Per-script byte-budget overrides applied on top of the entropy- + * proportional allocation. Empty in the current configuration: an + * experiment that gave HAN 60 MB instead of the entropy-derived 26 MB + * <i>worsened</i> Cohen's d for every non-HAN script (the global F1 + * hash table is the bottleneck, not the corpus), so the override + * mechanism is preserved as infrastructure but is not currently used. + */ + public static final Map<String, Long> SCRIPT_BUDGET_OVERRIDES = + Collections.emptyMap(); + + // ======================================================================= + // Model train (TrainJunkModel) + // ======================================================================= + + /** + * Drop F1 bigrams whose global per-pair occurrence count is below this + * threshold from the codepoint-bigram hash table and Bloom filter. + * Set to 3 on evidence that singleton and doubleton pairs are + * overwhelmingly OCR artifacts and proper-noun noise that inflate the + * clean-side score distribution tail without contributing signal. + * + * <p>Set to 1 to disable the filter (legacy behavior). + */ + public static final int MIN_BIGRAM_COUNT = 3; + + /** + * Bloom filter capacity in bits for the F1 codepoint-bigram membership + * oracle. Must be a multiple of 64. 16 Mbit gives a comfortable false- + * positive rate at the current corpus's distinct-pair count. + */ + public static final int BLOOM_BITS = 16 * 1024 * 1024; + + private JunkDetectorTrainingConfig() { + // No instances. 
+ } +} diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java index 2d95d7db5e..229c60a5c5 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java @@ -201,8 +201,20 @@ public class TrainJunkModel { Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", "madlad", "junkdetect"); Path output = dataDir.resolve("junkdetect.bin"); - int bloomBits = V6_BLOOM_BITS_DEFAULT; - int minBigramCount = 1; + + // Durable training parameters live in JunkDetectorTrainingConfig; this + // tool deliberately refuses CLI overrides so a built model file's + // identity always matches a committed config. + int bloomBits = JunkDetectorTrainingConfig.BLOOM_BITS; + int minBigramCount = JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT; + if (bloomBits % 64 != 0) { + System.err.println("ERROR: BLOOM_BITS must be a multiple of 64"); + System.exit(1); + } + if (minBigramCount < 1) { + System.err.println("ERROR: MIN_BIGRAM_COUNT must be >= 1"); + System.exit(1); + } for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -213,18 +225,10 @@ public class TrainJunkModel { output = Paths.get(args[++i]); break; case "--bloom-bits": - bloomBits = Integer.parseInt(args[++i]); - if (bloomBits % 64 != 0) { - System.err.println("ERROR: --bloom-bits must be a multiple of 64"); - System.exit(1); - } - break; case "--min-bigram-count": - minBigramCount = Integer.parseInt(args[++i]); - if (minBigramCount < 1) { - System.err.println("ERROR: --min-bigram-count must be >= 1"); - System.exit(1); - } + System.err.println("ERROR: " + args[i] + " is no longer a CLI option." 
+ + " Edit JunkDetectorTrainingConfig and commit the change instead."); + System.exit(1); break; default: System.err.println("Unknown argument: " + args[i]); @@ -236,12 +240,14 @@ public class TrainJunkModel { System.out.println("=== TrainJunkModel ==="); System.out.println(" data-dir: " + dataDir); System.out.println(" output: " + output); + System.out.println(" --- v6 format constants (TrainJunkModel) ---"); System.out.printf( " bigram_buckets: %d%n", V6_BIGRAM_BUCKETS); System.out.printf( " unigram_buckets: %d%n", V6_UNIGRAM_BUCKETS); - System.out.printf( " bloom_bits: %d (%d KB), k=%d%n", - bloomBits, bloomBits / 8 / 1024, V6_BLOOM_K); System.out.printf( " fnv_seed: 0x%08X%n", V6_FNV_SEED); System.out.printf( " backoff_alpha: %.2f%n", V6_BACKOFF_ALPHA); + System.out.println(" --- config (JunkDetectorTrainingConfig) ---"); + System.out.printf( " bloom_bits: %d (%d KB), k=%d%n", + bloomBits, bloomBits / 8 / 1024, V6_BLOOM_K); System.out.printf( " min_bigram_count: %d%n", minBigramCount); if (!Files.isDirectory(dataDir)) { @@ -1525,16 +1531,13 @@ public class TrainJunkModel { private static void printUsage() { System.err.println("Usage: TrainJunkModel [options]"); - System.err.println(" --data-dir <path> Directory with {script}.train.gz / .dev.gz files"); - System.err.println(" (default: ~/datasets/madlad/junkdetect)"); - System.err.println(" --output <path> Output model file"); - System.err.println(" (default: {data-dir}/junkdetect.bin)"); - System.err.println(" --bloom-bits <n> F1 Bloom filter size in bits (multiple of 64)"); - System.err.println(" --min-bigram-count <n> Drop F1 bigrams with global per-pair count < n."); - System.err.println(" n>=2 enables a pre-pass that tallies per-pair"); - System.err.println(" counts; rare bigrams (typically OCR/proper-noun"); - System.err.println(" noise) are excluded from the hash table and"); - System.err.println(" Bloom filter, cutting model size and FPR with"); - System.err.println(" negligible TPR impact. 
Default: 1 (no pruning)."); + System.err.println(" --data-dir <path> Directory with {script}.train.gz / .dev.gz files"); + System.err.println(" (default: ~/datasets/madlad/junkdetect)"); + System.err.println(" --output <path> Output model file"); + System.err.println(" (default: {data-dir}/junkdetect.bin)"); + System.err.println(); + System.err.println("All other training parameters (Bloom filter size, min bigram count, etc.)"); + System.err.println("are fixed in JunkDetectorTrainingConfig and tracked in git. Edit that"); + System.err.println("file and commit to change them."); } } diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java new file mode 100644 index 0000000000..a0f975eb46 --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.junkdetect.tools; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Set; + +import org.junit.jupiter.api.Test; + +/** + * Pin-test for {@link JunkDetectorTrainingConfig}. + * + * <p>The values exercised here are the durable choices that define the + * shipping junk-detector model's identity. This test exists so that any + * change to those values requires updating an assertion in the same + * commit, surfacing the change in code review rather than letting it + * slip silently. + * + * <p>If you are intentionally tuning a parameter, update both the + * constant and the matching assertion below in the same change. Do not + * "fix" a failing assertion in isolation. + */ +class JunkDetectorTrainingConfigTest { + + @Test + void corpusBuildValues() { + assertEquals(500_000_000L, + JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES); + assertEquals(5_000_000L, + JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES); + assertEquals(0.05, + JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC, 1e-9); + assertEquals(50, + JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE); + assertEquals(0.30, + JunkDetectorTrainingConfig.MAX_PUNC_FRAC, 1e-9); + assertEquals(500, + JunkDetectorTrainingConfig.MIN_DEV_SENTENCES); + assertEquals(2_000, + JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES); + assertEquals(200_000L, + JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES); + assertEquals(42, + JunkDetectorTrainingConfig.SEED); + } + + @Test + void droppedScripts() { + Set<String> drop = JunkDetectorTrainingConfig.DROP_SCRIPTS; + assertEquals(Set.of("GOTHIC", "THAANA"), drop); + // Must be immutable: any caller that tries to mutate the set + // should fail loudly rather than corrupting the shared config. 
+ assertThrows(UnsupportedOperationException.class, + () -> drop.add("FAKE")); + } + + @Test + void scriptBudgetOverridesEmptyByDefault() { + // We tried HAN=60MB; it lowered Cohen's d for every non-HAN script + // because the global F1 hash table is the bottleneck. Keep this + // map empty until v7 (per-script F1 tables) lands. + assertTrue(JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES.isEmpty()); + } + + @Test + void modelTrainValues() { + assertEquals(3, JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT); + assertEquals(16 * 1024 * 1024, JunkDetectorTrainingConfig.BLOOM_BITS); + assertEquals(0, JunkDetectorTrainingConfig.BLOOM_BITS % 64, + "BLOOM_BITS must be a multiple of 64"); + } + + @Test + void notInstantiable() { + // The class is a frozen configuration container; making it + // instantiable would invite per-call mutation. + java.lang.reflect.Constructor<?>[] ctors = + JunkDetectorTrainingConfig.class.getDeclaredConstructors(); + assertEquals(1, ctors.length, "expected exactly one constructor"); + assertFalse(java.lang.reflect.Modifier.isPublic(ctors[0].getModifiers()), + "constructor should not be public"); + } +}
