(tika) 05/06: junk-detector: move training choices into JunkDetectorTrainingConfig

tallison Thu, 14 May 2026 11:45:58 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch junk-detector-v6
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 8c91c28a5885f445201a97de6d075a1fc28b3e0d
Author: tballison <[email protected]>
AuthorDate: Thu May 14 13:42:03 2026 -0400

    junk-detector: move training choices into JunkDetectorTrainingConfig
    
    Replaces per-tool CLI flags for durable training/build parameters with a
    single committed config class.  The CLI surface of the two tools shrinks
    to --data-dir, --output-dir (BuildJunkTrainingData) or --output
    (TrainJunkModel), and --dry-run (BuildJunkTrainingData only).  Any
    attempt to pass a now-removed flag like --total-budget-bytes or
    --min-bigram-count is rejected with a pointer to the config class.
    
    Rationale: we've repeatedly burned cycles asking "wait, which run was
    that?" when a model file's identity depended on shell history rather
    than tracked source.  With this change every parameter that affects the
    model lives in code that's reviewed and grep-able from a commit hash.
    
    The config values pin the current shipping setup: 500 MB total budget
    with a 5 MB per-language cap, 5% target-script-fraction line filter,
    GOTHIC and THAANA dropped, min_bigram_count = 3, 16 Mbit Bloom.  These
    together produce macro Cohen's d = 12.11 / FPR = 0.004 / TPR = 0.894 on
    the dev split (vs. honest v6 baseline of 9.81 / 0.017 / 0.865).
    
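    The smoke rerun produced a model file whose MD5 matches that of the prior
    CLI-flag-driven v2 model, i.e. the two files are byte-identical, so the
    refactor is behavior-preserving for the pinned parameter values.  (A
    flagless run is not equivalent to the old one: the previous defaults
    were a 50 MB total budget and min_bigram_count = 1.)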
    
    Format-tied constants (V6_BIGRAM_BUCKETS, V6_FNV_SEED, etc.) stay in
    TrainJunkModel — they're part of the v6 binary protocol, not tunable
    training choices, and moving them would muddy the distinction.
    
    Test JunkDetectorTrainingConfigTest pins the current values so any
    future change has to land alongside an explicit assertion update.
    
    29 tests pass (24 previous + 5 new).
    
    Co-authored-by: Cursor <[email protected]>
---
 .../ml/junkdetect/tools/BuildJunkTrainingData.java | 195 ++++++++-------------
 .../tools/JunkDetectorTrainingConfig.java          | 179 +++++++++++++++++++
 .../tika/ml/junkdetect/tools/TrainJunkModel.java   |  55 +++---
 .../tools/JunkDetectorTrainingConfigTest.java      | 101 +++++++++++
 4 files changed, 381 insertions(+), 149 deletions(-)

diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index 26ba0b9732..a80fafbd6b 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -82,66 +82,18 @@ import java.util.zip.GZIPOutputStream;
 public class BuildJunkTrainingData {
 
     // -----------------------------------------------------------------------
-    // Defaults
+    // Split ratios — fixed, part of the model identity (changing them would
+    // invalidate downstream eval comparisons).
     // -----------------------------------------------------------------------
 
-    /** Lines read per language to determine dominant script. */
-    private static final int DEFAULT_SCRIPT_SAMPLE_LINES = 2_000;
-
-    /**
-     * UTF-8 bytes loaded per script group for entropy estimation.
-     * Budget is spread evenly across languages in the group.
-     * 200KB is enough to observe the bigram distribution reliably.
-     */
-    private static final long ENTROPY_SAMPLE_BYTES = 200_000L;
-
-    /**
-     * Total UTF-8 byte budget across all script groups.  Divided 
proportionally
-     * by bigram entropy after the sampling phase.  50MB gives ~1–3MB per 
script
-     * on average across 34 groups; scale up for production runs.
-     */
-    private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L;
-
-    /**
-     * Maximum UTF-8 bytes any single language may contribute to its script
-     * bucket.  Prevents one language (e.g. {@code zho} with 8 GB of MADLAD)
-     * from dominating a multi-language script.  Languages with less than this
-     * available take what they have; languages above the cap get truncated.
-     * Default {@code 5 MB} balances diversity against per-language coverage.
-     */
-    private static final long DEFAULT_PER_LANGUAGE_CAP_BYTES = 5_000_000L;
-
-    /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */
-    private static final int DEFAULT_MIN_BYTES = 50;
-
-    /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */
-    private static final double DEFAULT_MAX_PUNC_FRAC = 0.30;
-
-    /**
-     * Minimum fraction of a sentence's non-COMMON/INHERITED codepoints that
-     * must belong to the script bucket's target script for the sentence to be
-     * accepted.  Lines whose target-script fraction falls below this floor are
-     * dropped — typically these are off-target Wikipedia stubs (e.g. an 
article
-     * about Gothic written almost entirely in English).  Set very low by
-     * default so that legitimate mixed-script content (Japanese with kanji +
-     * kana, Korean with hanja annotations, Chinese with English citations) is
-     * preserved.
-     */
-    private static final double DEFAULT_MIN_TARGET_SCRIPT_FRAC = 0.05;
-
     /** Fraction of sentences written to each split (train / dev / test = 
80/10/10). */
     private static final double TRAIN_FRAC = 0.80;
     private static final double DEV_FRAC   = 0.10;
     // remaining (1 - TRAIN_FRAC - DEV_FRAC) goes to the test split
 
-    /**
-     * Minimum number of sentences that must land in the dev split for a 
script to be
-     * included in the model.  Scripts below this floor have too few samples 
to reliably
-     * estimate calibration statistics (mu/sigma), which produces noisy 
z-scores and
-     * inflated false positive rates.  With DEV_FRAC=0.10 the effective 
minimum total
-     * sentence count is minDevSentences / DEV_FRAC (default: 5,000 total 
sentences).
-     */
-    private static final int DEFAULT_MIN_DEV_SENTENCES = 500;
+    // All other durable parameters live in JunkDetectorTrainingConfig.  This
+    // tool deliberately does not accept CLI overrides for those values; see
+    // the rejection logic in main() below.
 
     // -----------------------------------------------------------------------
     // Entry point
@@ -150,16 +102,22 @@ public class BuildJunkTrainingData {
     public static void main(String[] args) throws IOException {
         Path dataDir = Paths.get(System.getProperty("user.home"), "datasets", 
"madlad", "data");
         Path outputDir = Paths.get(System.getProperty("user.home"), 
"datasets", "madlad", "junkdetect");
-        int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES;
-        long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES;
-        long perLanguageCapBytes = DEFAULT_PER_LANGUAGE_CAP_BYTES;
-        int minBytes = DEFAULT_MIN_BYTES;
-        double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC;
-        double minTargetScriptFrac = DEFAULT_MIN_TARGET_SCRIPT_FRAC;
-        int seed = 42;
         boolean dryRun = false;
-        int minDevSentences = DEFAULT_MIN_DEV_SENTENCES;
-        java.util.Set<String> dropScripts = new java.util.HashSet<>();
+
+        // Bind config-controlled values into local variables.  These are
+        // read-only from this point on; any attempt to override them via CLI
+        // is rejected below.
+        long totalBudgetBytes = JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES;
+        long perLanguageCapBytes = 
JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES;
+        int minBytes = JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE;
+        double maxPuncFrac = JunkDetectorTrainingConfig.MAX_PUNC_FRAC;
+        double minTargetScriptFrac = 
JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC;
+        int minDevSentences = JunkDetectorTrainingConfig.MIN_DEV_SENTENCES;
+        int scriptSampleLines = JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES;
+        int seed = JunkDetectorTrainingConfig.SEED;
+        java.util.Set<String> dropScripts = 
JunkDetectorTrainingConfig.DROP_SCRIPTS;
+        Map<String, Long> scriptBudgetOverrides =
+                JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES;
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -169,38 +127,25 @@ public class BuildJunkTrainingData {
                 case "--output-dir":
                     outputDir = Paths.get(args[++i]);
                     break;
-                case "--script-sample-lines":
-                    scriptSampleLines = Integer.parseInt(args[++i]);
+                case "--dry-run":
+                    dryRun = true;
                     break;
+                // Durable parameters are config-controlled.  Refuse any CLI
+                // override so that a model file's identity always matches the
+                // committed config.
+                case "--script-sample-lines":
                 case "--total-budget-bytes":
-                    totalBudgetBytes = Long.parseLong(args[++i]);
-                    break;
                 case "--per-language-cap-bytes":
-                    perLanguageCapBytes = Long.parseLong(args[++i]);
-                    break;
                 case "--min-bytes":
-                    minBytes = Integer.parseInt(args[++i]);
-                    break;
                 case "--max-punc-frac":
-                    maxPuncFrac = Double.parseDouble(args[++i]);
-                    break;
                 case "--min-target-script-frac":
-                    minTargetScriptFrac = Double.parseDouble(args[++i]);
-                    break;
                 case "--seed":
-                    seed = Integer.parseInt(args[++i]);
-                    break;
                 case "--min-dev-sentences":
-                    minDevSentences = Integer.parseInt(args[++i]);
-                    break;
                 case "--drop-scripts":
-                    for (String s : args[++i].split(",")) {
-                        String t = s.trim().toUpperCase();
-                        if (!t.isEmpty()) dropScripts.add(t);
-                    }
-                    break;
-                case "--dry-run":
-                    dryRun = true;
+                case "--script-budget-override":
+                    System.err.println("ERROR: " + args[i] + " is no longer a 
CLI option."
+                            + "  Edit JunkDetectorTrainingConfig and commit 
the change instead.");
+                    System.exit(1);
                     break;
                 default:
                     System.err.println("Unknown argument: " + args[i]);
@@ -210,21 +155,26 @@ public class BuildJunkTrainingData {
         }
 
         System.out.println("=== BuildJunkTrainingData ===");
-        System.out.println("  data-dir:           " + dataDir);
-        System.out.println("  output-dir:         " + outputDir);
-        System.out.printf( "  total-budget-bytes:    %,d (%.1f MB)%n",
+        System.out.println("  data-dir:               " + dataDir);
+        System.out.println("  output-dir:             " + outputDir);
+        System.out.println("  --- config (JunkDetectorTrainingConfig) ---");
+        System.out.printf( "  total-budget-bytes:     %,d (%.1f MB)%n",
                 totalBudgetBytes, totalBudgetBytes / 1_000_000.0);
-        System.out.printf( "  per-language-cap:      %,d (%.1f MB)%n",
+        System.out.printf( "  per-language-cap:       %,d (%.1f MB)%n",
                 perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0);
-        System.out.printf( "  min-bytes:             %d%n", minBytes);
-        System.out.printf( "  max-punc-frac:         %.2f%n", maxPuncFrac);
+        System.out.printf( "  min-bytes:              %d%n", minBytes);
+        System.out.printf( "  max-punc-frac:          %.2f%n", maxPuncFrac);
         System.out.printf( "  min-target-script-frac: %.2f%n", 
minTargetScriptFrac);
-        System.out.printf( "  min-dev-sentences:     %d  (min total ≈ %d)%n",
+        System.out.printf( "  min-dev-sentences:      %d  (min total ≈ %d)%n",
                 minDevSentences, (int)(minDevSentences / DEV_FRAC));
+        System.out.printf( "  seed:                   %d%n", seed);
         if (!dropScripts.isEmpty()) {
-            System.out.println("  drop-scripts:          " + dropScripts);
+            System.out.println("  drop-scripts:           " + dropScripts);
         }
-        System.out.println("  dry-run:               " + dryRun);
+        if (!scriptBudgetOverrides.isEmpty()) {
+            System.out.println("  script-budget-override: " + 
scriptBudgetOverrides);
+        }
+        System.out.println("  dry-run:                " + dryRun);
 
         if (!Files.isDirectory(dataDir)) {
             System.err.println("ERROR: data-dir not found: " + dataDir);
@@ -273,7 +223,8 @@ public class BuildJunkTrainingData {
             String script = entry.getKey();
             List<Path> langDirs = entry.getValue();
 
-            long perLangSampleBytes = Math.max(ENTROPY_SAMPLE_BYTES / 
langDirs.size(), 2_000L);
+            long perLangSampleBytes = Math.max(
+                    JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES / 
langDirs.size(), 2_000L);
             List<String> sample = new ArrayList<>();
             for (Path langDir : langDirs) {
                 loadSentences(langDir, perLangSampleBytes, minBytes, 
maxPuncFrac, sample);
@@ -297,9 +248,25 @@ public class BuildJunkTrainingData {
         Map<String, Long> scriptBudget = new TreeMap<>();
         for (Map.Entry<String, Double> e : scriptEntropy.entrySet()) {
             long budget = (long) (totalBudgetBytes * e.getValue() / 
totalEntropy);
+            Long override = scriptBudgetOverrides.get(e.getKey());
+            if (override != null) {
+                System.out.printf("  %-20s H=%.3f → %,d bytes (%.1f MB)"
+                        + "  [OVERRIDE: was %,d (%.1f MB)]%n",
+                        e.getKey(), e.getValue(), override, override / 
1_000_000.0,
+                        budget, budget / 1_000_000.0);
+                budget = override;
+            } else {
+                System.out.printf("  %-20s H=%.3f → %,d bytes (%.1f MB)%n",
+                        e.getKey(), e.getValue(), budget, budget / 
1_000_000.0);
+            }
             scriptBudget.put(e.getKey(), budget);
-            System.out.printf("  %-20s H=%.3f → %,d bytes (%.1f MB)%n",
-                    e.getKey(), e.getValue(), budget, budget / 1_000_000.0);
+        }
+        // Warn about overrides for scripts that aren't in the bucket set.
+        for (String k : scriptBudgetOverrides.keySet()) {
+            if (!scriptBudget.containsKey(k)) {
+                System.err.printf("WARNING: --script-budget-override for %s 
ignored"
+                        + " (script not in bucket set)%n", k);
+            }
         }
 
         if (dryRun) {
@@ -752,33 +719,15 @@ public class BuildJunkTrainingData {
 
     private static void printUsage() {
         System.err.println("Usage: BuildJunkTrainingData [options]");
-        System.err.println("  --data-dir               <path>  MADLAD data 
root"
+        System.err.println("  --data-dir   <path>  MADLAD data root"
                 + " (default: ~/datasets/madlad/data)");
-        System.err.println("  --output-dir             <path>  Output 
directory"
+        System.err.println("  --output-dir <path>  Output directory"
                 + " (default: ~/datasets/madlad/junkdetect)");
-        System.err.println("  --script-sample-lines    N       Lines per 
language for script"
-                + " detection (default: 2000)");
-        System.err.println("  --total-budget-bytes     N       Total UTF-8 
bytes across all"
-                + " scripts (default: 50000000)");
-        System.err.println("  --per-language-cap-bytes N       Max UTF-8 bytes 
contributed by any"
-                + " single language to its script bucket (default: 5000000).  
Prevents one large"
-                + " language source from dominating a multi-language bucket.");
-        System.err.println("  --min-bytes              N       Min UTF-8 bytes 
per sentence"
-                + " (default: 50)");
-        System.err.println("  --max-punc-frac          F       Max ASCII punct 
fraction"
-                + " (default: 0.30)");
-        System.err.println("  --min-target-script-frac F       Min fraction of 
non-COMMON cps that"
-                + " must be in the bucket's target script for a sentence to be 
kept"
-                + " (default: 0.05).  Filters off-target Wikipedia stubs (e.g. 
English-about-Gothic"
-                + " articles in the GOTHIC bucket).");
-        System.err.println("  --min-dev-sentences      N       Min sentences 
in dev split for a"
-                + " script to be included (default: 500). Scripts below this 
floor"
-                + " have unreliable calibration and inflated FPR.");
-        System.err.println("  --drop-scripts           S,S,..  Comma-separated 
script bucket names"
-                + " to exclude (e.g. GOTHIC,THAANA).  Use when source data is 
too thin or off-"
-                + " target for reliable distribution estimates.");
-        System.err.println("  --seed                   N       Random seed 
(default: 42)");
-        System.err.println("  --dry-run                        Detect scripts 
+ show budget,"
-                + " skip file writing");
+        System.err.println("  --dry-run            Detect scripts + show 
budget,"
+                + " skip file writing.");
+        System.err.println();
+        System.err.println("All other training/build parameters (budgets, 
filters, dropped"
+                + " scripts, seed, etc.) are fixed in 
JunkDetectorTrainingConfig and tracked"
+                + " in git.  Edit that file and commit to change them.");
     }
 }
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
new file mode 100644
index 0000000000..15600a96c6
--- /dev/null
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfig.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Frozen set of training-time choices that together define a junk-detector
+ * model's identity.  Any change to these values produces a meaningfully
+ * different model and must be reviewed in git.
+ *
+ * <p>Two principles drove making this a class rather than CLI flags:
+ *
+ * <ol>
+ *   <li><b>Reproducibility.</b>  When we look back at a model file six
+ *       months later we want a single commit hash that says exactly what
+ *       knobs produced it, not a half-remembered shell history.
+ *   <li><b>Drift prevention.</b>  CLI flags with defaults allow accidental
+ *       deviation between developers ("did you remember to pass
+ *       {@code --min-target-script-frac 0.05}?").  Constants in a tracked
+ *       file remove that failure mode.
+ * </ol>
+ *
+ * <p>{@link BuildJunkTrainingData} and {@link TrainJunkModel} read the
+ * values here; both tools <b>refuse to start</b> if any CLI argument
+ * attempts to override a config-controlled parameter, surfacing the
+ * mistake at launch time rather than silently producing a non-canonical
+ * model.
+ *
+ * <p>The constants below reflect the choices that produced the current
+ * shipping model and are recorded in the corresponding training notes
+ * ({@code 20260514-junk-retrain-v6.md}).  Update them by editing this
+ * file and committing the change together with the new model output.
+ *
+ * <p>The class has no instance state; all values are exposed as
+ * {@code public static final}.  This keeps callsites short and avoids
+ * the temptation of passing a runtime-mutable config around.
+ *
+ * <p>This is not part of the public model-loading API.  The {@link
+ * org.apache.tika.ml.junkdetect.JunkDetector} runtime is configuration-
+ * free; once a model file is built, all of its baked-in choices travel
+ * with the file's binary format.
+ */
+public final class JunkDetectorTrainingConfig {
+
+    // =======================================================================
+    // Corpus build (BuildJunkTrainingData)
+    // =======================================================================
+
+    /**
+     * Total UTF-8 byte budget across all script groups.  Divided
+     * proportionally by per-script bigram entropy after the sampling phase.
+     */
+    public static final long TOTAL_BUDGET_BYTES = 500_000_000L;
+
+    /**
+     * Maximum UTF-8 bytes a single language may contribute to a
+     * multi-language script bucket.  Prevents one large source (e.g. {@code
+     * zho} with 8 GB of MADLAD) from dominating a multi-language script
+     * model.  Buckets with only one language ignore this cap and may consume
+     * their full budget.  See {@link BuildJunkTrainingData} Phase 4.
+     */
+    public static final long PER_LANGUAGE_CAP_BYTES = 5_000_000L;
+
+    /**
+     * Sentence-level filter: minimum fraction of non-COMMON/INHERITED
+     * codepoints that must belong to the script bucket's target script for a
+     * sentence to be accepted.  Set low so legitimate mixed-script content
+     * (Japanese kanji + kana, Korean with hanja annotations, Chinese with
+     * English citations, etc.) is preserved, but enough to reject lines that
+     * are essentially off-target (e.g. an English article about Gothic in
+     * the GOTHIC bucket).
+     */
+    public static final double MIN_TARGET_SCRIPT_FRAC = 0.05;
+
+    /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */
+    public static final int MIN_BYTES_PER_SENTENCE = 50;
+
+    /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */
+    public static final double MAX_PUNC_FRAC = 0.30;
+
+    /**
+     * Minimum number of sentences that must land in the dev split for a
+     * script to be included in the model.  Scripts below this floor have
+     * insufficient data to reliably estimate calibration statistics, which
+     * inflates FPR.  With {@code DEV_FRAC = 0.10} this corresponds to a
+     * total-sentence floor of {@code 500 / 0.10 = 5000} per script.
+     */
+    public static final int MIN_DEV_SENTENCES = 500;
+
+    /** Lines read per language to determine the language's dominant script. */
+    public static final int SCRIPT_SAMPLE_LINES = 2_000;
+
+    /**
+     * UTF-8 bytes loaded per script group for bigram entropy estimation,
+     * driving the entropy-proportional budget allocation.  200 KB is
+     * sufficient to characterise the bigram distribution of any single
+     * script.
+     */
+    public static final long ENTROPY_SAMPLE_BYTES = 200_000L;
+
+    /** Random seed for sentence shuffling and other corpus-build randomness. 
*/
+    public static final int SEED = 42;
+
+    /**
+     * Script bucket names whose source data is too thin or too off-target
+     * to produce reliable per-script F1 calibration.  Excluded from the
+     * model entirely; the {@link
+     * org.apache.tika.ml.junkdetect.JunkDetector#score(String)} routing
+     * falls back to "unknown script" behavior for these scripts.
+     *
+     * <p>The current selection is based on a corpus audit that found these
+     * scripts either had thin native source data (e.g. THAANA: 216 train
+     * sentences from Maldivian), or had sources dominated by off-target
+     * content (e.g. GOTHIC: 40% of lines are {@literal <}5% Gothic — the
+     * Wikipedia "gothic" directory is English text about Gothic).
+     *
+     * <p>Three further scripts (CANADIAN_ABORIGINAL, CHEROKEE, TIFINAGH)
+     * are not listed here because the {@link #MIN_TARGET_SCRIPT_FRAC}
+     * filter implicitly removes them — their MADLAD sources contain
+     * effectively no native-script content at the 5% threshold.  Listing
+     * them here is unnecessary and would obscure the data-quality finding.
+     */
+    public static final Set<String> DROP_SCRIPTS =
+            Collections.unmodifiableSet(new 
java.util.TreeSet<>(Set.of("GOTHIC", "THAANA")));
+
+    /**
+     * Per-script byte-budget overrides applied on top of the entropy-
+     * proportional allocation.  Empty in the current configuration: an
+     * experiment that gave HAN 60 MB instead of the entropy-derived 26 MB
+     * <i>worsened</i> Cohen's d for every non-HAN script (the global F1
+     * hash table is the bottleneck, not the corpus), so the override
+     * mechanism is preserved as infrastructure but is not currently used.
+     */
+    public static final Map<String, Long> SCRIPT_BUDGET_OVERRIDES =
+            Collections.emptyMap();
+
+    // =======================================================================
+    // Model train (TrainJunkModel)
+    // =======================================================================
+
+    /**
+     * Drop F1 bigrams whose global per-pair occurrence count is below this
+     * threshold from the codepoint-bigram hash table and Bloom filter.
+     * Set to 3 on evidence that singleton and doubleton pairs are
+     * overwhelmingly OCR artifacts and proper-noun noise that inflate the
+     * clean-side score distribution tail without contributing signal.
+     *
+     * <p>Set to 1 to disable the filter (legacy behavior).
+     */
+    public static final int MIN_BIGRAM_COUNT = 3;
+
+    /**
+     * Bloom filter capacity in bits for the F1 codepoint-bigram membership
+     * oracle.  Must be a multiple of 64.  16 Mbit gives a comfortable false-
+     * positive rate at the current corpus's distinct-pair count.
+     */
+    public static final int BLOOM_BITS = 16 * 1024 * 1024;
+
+    private JunkDetectorTrainingConfig() {
+        // No instances.
+    }
+}
diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index 2d95d7db5e..229c60a5c5 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -201,8 +201,20 @@ public class TrainJunkModel {
         Path dataDir = Paths.get(System.getProperty("user.home"),
                 "datasets", "madlad", "junkdetect");
         Path output = dataDir.resolve("junkdetect.bin");
-        int bloomBits = V6_BLOOM_BITS_DEFAULT;
-        int minBigramCount = 1;
+
+        // Durable training parameters live in JunkDetectorTrainingConfig; this
+        // tool deliberately refuses CLI overrides so a built model file's
+        // identity always matches a committed config.
+        int bloomBits = JunkDetectorTrainingConfig.BLOOM_BITS;
+        int minBigramCount = JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT;
+        if (bloomBits % 64 != 0) {
+            System.err.println("ERROR: BLOOM_BITS must be a multiple of 64");
+            System.exit(1);
+        }
+        if (minBigramCount < 1) {
+            System.err.println("ERROR: MIN_BIGRAM_COUNT must be >= 1");
+            System.exit(1);
+        }
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -213,18 +225,10 @@ public class TrainJunkModel {
                     output = Paths.get(args[++i]);
                     break;
                 case "--bloom-bits":
-                    bloomBits = Integer.parseInt(args[++i]);
-                    if (bloomBits % 64 != 0) {
-                        System.err.println("ERROR: --bloom-bits must be a 
multiple of 64");
-                        System.exit(1);
-                    }
-                    break;
                 case "--min-bigram-count":
-                    minBigramCount = Integer.parseInt(args[++i]);
-                    if (minBigramCount < 1) {
-                        System.err.println("ERROR: --min-bigram-count must be 
>= 1");
-                        System.exit(1);
-                    }
+                    System.err.println("ERROR: " + args[i] + " is no longer a 
CLI option."
+                            + "  Edit JunkDetectorTrainingConfig and commit 
the change instead.");
+                    System.exit(1);
                     break;
                 default:
                     System.err.println("Unknown argument: " + args[i]);
@@ -236,12 +240,14 @@ public class TrainJunkModel {
         System.out.println("=== TrainJunkModel ===");
         System.out.println("  data-dir:           " + dataDir);
         System.out.println("  output:             " + output);
+        System.out.println("  --- v6 format constants (TrainJunkModel) ---");
         System.out.printf( "  bigram_buckets:     %d%n", V6_BIGRAM_BUCKETS);
         System.out.printf( "  unigram_buckets:    %d%n", V6_UNIGRAM_BUCKETS);
-        System.out.printf( "  bloom_bits:         %d (%d KB), k=%d%n",
-                bloomBits, bloomBits / 8 / 1024, V6_BLOOM_K);
         System.out.printf( "  fnv_seed:           0x%08X%n", V6_FNV_SEED);
         System.out.printf( "  backoff_alpha:      %.2f%n", V6_BACKOFF_ALPHA);
+        System.out.println("  --- config (JunkDetectorTrainingConfig) ---");
+        System.out.printf( "  bloom_bits:         %d (%d KB), k=%d%n",
+                bloomBits, bloomBits / 8 / 1024, V6_BLOOM_K);
         System.out.printf( "  min_bigram_count:   %d%n", minBigramCount);
 
         if (!Files.isDirectory(dataDir)) {
@@ -1525,16 +1531,13 @@ public class TrainJunkModel {
 
     private static void printUsage() {
         System.err.println("Usage: TrainJunkModel [options]");
-        System.err.println("  --data-dir <path>         Directory with {script}.train.gz / .dev.gz files");
-        System.err.println("                            (default: ~/datasets/madlad/junkdetect)");
-        System.err.println("  --output   <path>         Output model file");
-        System.err.println("                            (default: {data-dir}/junkdetect.bin)");
-        System.err.println("  --bloom-bits <n>          F1 Bloom filter size in bits (multiple of 64)");
-        System.err.println("  --min-bigram-count <n>    Drop F1 bigrams with global per-pair count < n.");
-        System.err.println("                            n>=2 enables a pre-pass that tallies per-pair");
-        System.err.println("                            counts; rare bigrams (typically OCR/proper-noun");
-        System.err.println("                            noise) are excluded from the hash table and");
-        System.err.println("                            Bloom filter, cutting model size and FPR with");
-        System.err.println("                            negligible TPR impact.  Default: 1 (no pruning).");
+        System.err.println("  --data-dir <path>  Directory with {script}.train.gz / .dev.gz files");
+        System.err.println("                     (default: ~/datasets/madlad/junkdetect)");
+        System.err.println("  --output   <path>  Output model file");
+        System.err.println("                     (default: {data-dir}/junkdetect.bin)");
+        System.err.println();
+        System.err.println("All other training parameters (Bloom filter size, min bigram count, etc.)");
+        System.err.println("are fixed in JunkDetectorTrainingConfig and tracked in git.  Edit that");
+        System.err.println("file and commit to change them.");
     }
 }
diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
new file mode 100644
index 0000000000..a0f975eb46
--- /dev/null
+++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/tools/JunkDetectorTrainingConfigTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect.tools;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Pin-test for {@link JunkDetectorTrainingConfig}.
+ *
+ * <p>The values exercised here are the durable choices that define the
+ * shipping junk-detector model's identity.  This test exists so that any
+ * change to those values requires updating an assertion in the same
+ * commit, surfacing the change in code review rather than letting it
+ * slip silently.
+ *
+ * <p>If you are intentionally tuning a parameter, update both the
+ * constant and the matching assertion below in the same change.  Do not
+ * "fix" a failing assertion in isolation.
+ */
+class JunkDetectorTrainingConfigTest {
+
+    @Test
+    void corpusBuildValues() {
+        assertEquals(500_000_000L,
+                JunkDetectorTrainingConfig.TOTAL_BUDGET_BYTES);
+        assertEquals(5_000_000L,
+                JunkDetectorTrainingConfig.PER_LANGUAGE_CAP_BYTES);
+        assertEquals(0.05,
+                JunkDetectorTrainingConfig.MIN_TARGET_SCRIPT_FRAC, 1e-9);
+        assertEquals(50,
+                JunkDetectorTrainingConfig.MIN_BYTES_PER_SENTENCE);
+        assertEquals(0.30,
+                JunkDetectorTrainingConfig.MAX_PUNC_FRAC, 1e-9);
+        assertEquals(500,
+                JunkDetectorTrainingConfig.MIN_DEV_SENTENCES);
+        assertEquals(2_000,
+                JunkDetectorTrainingConfig.SCRIPT_SAMPLE_LINES);
+        assertEquals(200_000L,
+                JunkDetectorTrainingConfig.ENTROPY_SAMPLE_BYTES);
+        assertEquals(42,
+                JunkDetectorTrainingConfig.SEED);
+    }
+
+    @Test
+    void droppedScripts() {
+        Set<String> drop = JunkDetectorTrainingConfig.DROP_SCRIPTS;
+        assertEquals(Set.of("GOTHIC", "THAANA"), drop);
+        // Must be immutable: any caller that tries to mutate the set
+        // should fail loudly rather than corrupting the shared config.
+        assertThrows(UnsupportedOperationException.class,
+                () -> drop.add("FAKE"));
+    }
+
+    @Test
+    void scriptBudgetOverridesEmptyByDefault() {
+        // We tried HAN=60MB; it lowered Cohen's d for every non-HAN script
+        // because the global F1 hash table is the bottleneck.  Keep this
+        // map empty until v7 (per-script F1 tables) lands.
+        assertTrue(JunkDetectorTrainingConfig.SCRIPT_BUDGET_OVERRIDES.isEmpty());
+    }
+
+    @Test
+    void modelTrainValues() {
+        assertEquals(3, JunkDetectorTrainingConfig.MIN_BIGRAM_COUNT);
+        assertEquals(16 * 1024 * 1024, JunkDetectorTrainingConfig.BLOOM_BITS);
+        assertEquals(0, JunkDetectorTrainingConfig.BLOOM_BITS % 64,
+                "BLOOM_BITS must be a multiple of 64");
+    }
+
+    @Test
+    void notInstantiable() {
+        // The class is a frozen configuration container; making it
+        // instantiable would invite per-call mutation.
+        java.lang.reflect.Constructor<?>[] ctors =
+                JunkDetectorTrainingConfig.class.getDeclaredConstructors();
+        assertEquals(1, ctors.length, "expected exactly one constructor");
+        assertFalse(java.lang.reflect.Modifier.isPublic(ctors[0].getModifiers()),
+                "constructor should not be public");
+    }
+}

(tika) 05/06: junk-detector: move training choices into JunkDetectorTrainingConfig

Reply via email to