This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch junk-detector-v6
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a24d53259ebbf0cfc9f7ceaf365fc78a7348bbe4
Author: tballison <[email protected]>
AuthorDate: Thu May 14 13:24:47 2026 -0400

    checkpoint
---
 .../ml/junkdetect/tools/BuildJunkTrainingData.java | 176 ++++++++++++++++++---
 1 file changed, 157 insertions(+), 19 deletions(-)

diff --git 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
index 27a5436d5e..26ba0b9732 100644
--- 
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
+++ 
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/BuildJunkTrainingData.java
@@ -102,12 +102,33 @@ public class BuildJunkTrainingData {
      */
     private static final long DEFAULT_TOTAL_BUDGET_BYTES = 50_000_000L;
 
+    /**
+     * Maximum UTF-8 bytes any single language may contribute to its script
+     * bucket.  Prevents one language (e.g. {@code zho} with 8 GB of MADLAD)
+     * from dominating a multi-language script.  Languages with less than this
+     * available take what they have; languages above the cap get truncated.
+     * Default {@code 5 MB} balances diversity against per-language coverage.
+     */
+    private static final long DEFAULT_PER_LANGUAGE_CAP_BYTES = 5_000_000L;
+
     /** Minimum UTF-8 byte length for a sentence to pass the quality filter. */
     private static final int DEFAULT_MIN_BYTES = 50;
 
     /** Maximum fraction of codepoints that may be ASCII punctuation/digits. */
     private static final double DEFAULT_MAX_PUNC_FRAC = 0.30;
 
+    /**
+     * Minimum fraction of a sentence's non-COMMON/INHERITED codepoints that
+     * must belong to the script bucket's target script for the sentence to be
+     * accepted.  Lines whose target-script fraction falls below this floor are
+     * dropped — typically these are off-target Wikipedia stubs (e.g. an 
article
+     * about Gothic written almost entirely in English).  Set very low by
+     * default so that legitimate mixed-script content (Japanese with kanji +
+     * kana, Korean with hanja annotations, Chinese with English citations) is
+     * preserved.
+     */
+    private static final double DEFAULT_MIN_TARGET_SCRIPT_FRAC = 0.05;
+
     /** Fraction of sentences written to each split (train / dev / test = 
80/10/10). */
     private static final double TRAIN_FRAC = 0.80;
     private static final double DEV_FRAC   = 0.10;
@@ -131,11 +152,14 @@ public class BuildJunkTrainingData {
         Path outputDir = Paths.get(System.getProperty("user.home"), 
"datasets", "madlad", "junkdetect");
         int scriptSampleLines = DEFAULT_SCRIPT_SAMPLE_LINES;
         long totalBudgetBytes = DEFAULT_TOTAL_BUDGET_BYTES;
+        long perLanguageCapBytes = DEFAULT_PER_LANGUAGE_CAP_BYTES;
         int minBytes = DEFAULT_MIN_BYTES;
         double maxPuncFrac = DEFAULT_MAX_PUNC_FRAC;
+        double minTargetScriptFrac = DEFAULT_MIN_TARGET_SCRIPT_FRAC;
         int seed = 42;
         boolean dryRun = false;
         int minDevSentences = DEFAULT_MIN_DEV_SENTENCES;
+        java.util.Set<String> dropScripts = new java.util.HashSet<>();
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -151,18 +175,30 @@ public class BuildJunkTrainingData {
                 case "--total-budget-bytes":
                     totalBudgetBytes = Long.parseLong(args[++i]);
                     break;
+                case "--per-language-cap-bytes":
+                    perLanguageCapBytes = Long.parseLong(args[++i]);
+                    break;
                 case "--min-bytes":
                     minBytes = Integer.parseInt(args[++i]);
                     break;
                 case "--max-punc-frac":
                     maxPuncFrac = Double.parseDouble(args[++i]);
                     break;
+                case "--min-target-script-frac":
+                    minTargetScriptFrac = Double.parseDouble(args[++i]);
+                    break;
                 case "--seed":
                     seed = Integer.parseInt(args[++i]);
                     break;
                 case "--min-dev-sentences":
                     minDevSentences = Integer.parseInt(args[++i]);
                     break;
+                case "--drop-scripts":
+                    for (String s : args[++i].split(",")) {
+                        String t = s.trim().toUpperCase();
+                        if (!t.isEmpty()) dropScripts.add(t);
+                    }
+                    break;
                 case "--dry-run":
                     dryRun = true;
                     break;
@@ -176,13 +212,19 @@ public class BuildJunkTrainingData {
         System.out.println("=== BuildJunkTrainingData ===");
         System.out.println("  data-dir:           " + dataDir);
         System.out.println("  output-dir:         " + outputDir);
-        System.out.printf( "  total-budget-bytes: %,d (%.1f MB)%n",
+        System.out.printf( "  total-budget-bytes:    %,d (%.1f MB)%n",
                 totalBudgetBytes, totalBudgetBytes / 1_000_000.0);
-        System.out.printf( "  min-bytes:          %d%n", minBytes);
-        System.out.printf( "  max-punc-frac:      %.2f%n", maxPuncFrac);
-        System.out.printf( "  min-dev-sentences:  %d  (min total ≈ %d)%n",
+        System.out.printf( "  per-language-cap:      %,d (%.1f MB)%n",
+                perLanguageCapBytes, perLanguageCapBytes / 1_000_000.0);
+        System.out.printf( "  min-bytes:             %d%n", minBytes);
+        System.out.printf( "  max-punc-frac:         %.2f%n", maxPuncFrac);
+        System.out.printf( "  min-target-script-frac: %.2f%n", 
minTargetScriptFrac);
+        System.out.printf( "  min-dev-sentences:     %d  (min total ≈ %d)%n",
                 minDevSentences, (int)(minDevSentences / DEV_FRAC));
-        System.out.println("  dry-run:            " + dryRun);
+        if (!dropScripts.isEmpty()) {
+            System.out.println("  drop-scripts:          " + dropScripts);
+        }
+        System.out.println("  dry-run:               " + dryRun);
 
         if (!Files.isDirectory(dataDir)) {
             System.err.println("ERROR: data-dir not found: " + dataDir);
@@ -208,6 +250,15 @@ public class BuildJunkTrainingData {
                 System.out.printf("  %-12s → %s%n", lang, script);
             }
         }
+
+        if (!dropScripts.isEmpty()) {
+            for (String s : dropScripts) {
+                if (scriptGroups.remove(s) != null) {
+                    System.out.printf("  DROP script: %s%n", s);
+                }
+            }
+        }
+
         System.out.printf("%n  → %d languages, %d script groups%n",
                 langToScript.size(), scriptGroups.size());
 
@@ -273,8 +324,16 @@ public class BuildJunkTrainingData {
             String script = budgetEntry.getKey();
             long budget = budgetEntry.getValue();
             List<Path> langDirs = scriptGroups.get(script);
+            Character.UnicodeScript targetScript = parseUnicodeScript(script);
 
             long perLangBytes = Math.max(budget / langDirs.size(), 1L);
+            // Apply per-language cap on top of the even split, but only for
+            // multi-language buckets.  For single-language scripts (e.g. 
KHMER,
+            // HANGUL), the cap would needlessly limit a bucket that has only
+            // one source; let it consume its full budget instead.
+            long capPerLang = langDirs.size() > 1
+                    ? Math.min(perLangBytes, perLanguageCapBytes)
+                    : perLangBytes;
             List<String> sentences = new ArrayList<>();
             long totalBytesLoaded = 0;
 
@@ -282,8 +341,10 @@ public class BuildJunkTrainingData {
                 long remaining = budget - totalBytesLoaded;
                 if (remaining <= 0) break;
                 long langBytes = loadSentences(langDir,
-                        Math.min(perLangBytes, remaining),
-                        minBytes, maxPuncFrac, sentences);
+                        Math.min(capPerLang, remaining),
+                        minBytes, maxPuncFrac,
+                        targetScript, minTargetScriptFrac,
+                        sentences);
                 totalBytesLoaded += langBytes;
                 if (langBytes > 0) {
                     System.out.printf("  %-12s %-20s +%,d bytes%n",
@@ -327,7 +388,11 @@ public class BuildJunkTrainingData {
 
                 long newBudget = budget + extra;
                 List<Path> langDirs = scriptGroups.get(script);
+                Character.UnicodeScript targetScript = 
parseUnicodeScript(script);
                 long perLangBytes = Math.max(newBudget / langDirs.size(), 1L);
+                long capPerLang = langDirs.size() > 1
+                        ? Math.min(perLangBytes, perLanguageCapBytes)
+                        : perLangBytes;
 
                 List<String> sentences = new ArrayList<>();
                 long totalBytesLoaded = 0;
@@ -335,8 +400,10 @@ public class BuildJunkTrainingData {
                     long remaining = newBudget - totalBytesLoaded;
                     if (remaining <= 0) break;
                     long langBytes = loadSentences(langDir,
-                            Math.min(perLangBytes, remaining),
-                            minBytes, maxPuncFrac, sentences);
+                            Math.min(capPerLang, remaining),
+                            minBytes, maxPuncFrac,
+                            targetScript, minTargetScriptFrac,
+                            sentences);
                     totalBytesLoaded += langBytes;
                 }
                 if (!sentences.isEmpty()) {
@@ -415,6 +482,21 @@ public class BuildJunkTrainingData {
         System.out.println("Done.");
     }
 
+    /**
+     * Parses a script-bucket name (e.g. {@code "HAN"}) into a
+     * {@link Character.UnicodeScript}, or returns {@code null} if the name
+     * does not correspond to a real script (e.g. {@code "COMMON"} or any
+     * future synthetic bucket).  Used by the corpus builder to look up the
+     * target script for the {@code min-target-script-frac} filter.
+     */
+    static Character.UnicodeScript parseUnicodeScript(String name) {
+        try {
+            return Character.UnicodeScript.valueOf(name);
+        } catch (IllegalArgumentException e) {
+            return null;
+        }
+    }
+
     // -----------------------------------------------------------------------
     // Script detection
     // -----------------------------------------------------------------------
@@ -531,6 +613,22 @@ public class BuildJunkTrainingData {
      */
     static long loadSentences(Path langDir, long maxBytes, int minBytes,
                                double maxPuncFrac, List<String> result) {
+        // Backwards-compatible overload: no target-script filter.
+        return loadSentences(langDir, maxBytes, minBytes, maxPuncFrac,
+                null, 0.0, result);
+    }
+
+    /**
+     * Same as the 5-arg overload, but additionally drops sentences whose
+     * fraction of {@code targetScript} codepoints (relative to all non-
+     * COMMON/INHERITED codepoints) is below {@code minTargetScriptFrac}.
+     * Passing {@code targetScript == null} disables the target-script filter.
+     */
+    static long loadSentences(Path langDir, long maxBytes, int minBytes,
+                               double maxPuncFrac,
+                               Character.UnicodeScript targetScript,
+                               double minTargetScriptFrac,
+                               List<String> result) {
         long bytesLoaded = 0;
         for (String filename : new String[]{"sentences_wikipedia.txt", 
"sentences_madlad.txt"}) {
             if (bytesLoaded >= maxBytes) {
@@ -553,7 +651,8 @@ public class BuildJunkTrainingData {
                         if (text.isEmpty()) {
                             continue;
                         }
-                        String filtered = filterSentence(text, minBytes, 
maxPuncFrac);
+                        String filtered = filterSentence(text, minBytes, 
maxPuncFrac,
+                                targetScript, minTargetScriptFrac);
                         if (filtered != null) {
                             int sentBytes = 
filtered.getBytes(StandardCharsets.UTF_8).length;
                             result.add(filtered);
@@ -577,6 +676,18 @@ public class BuildJunkTrainingData {
      * @return the normalised sentence, or {@code null} if it should be 
discarded
      */
     static String filterSentence(String text, int minBytes, double 
maxPuncFrac) {
+        return filterSentence(text, minBytes, maxPuncFrac, null, 0.0);
+    }
+
+    /**
+     * Same as the 3-arg overload, but additionally rejects sentences whose
+     * fraction of {@code targetScript} codepoints (over non-COMMON/INHERITED
+     * codepoints) is below {@code minTargetScriptFrac}.  If {@code
+     * targetScript == null} the target-script filter is skipped.
+     */
+    static String filterSentence(String text, int minBytes, double maxPuncFrac,
+                                  Character.UnicodeScript targetScript,
+                                  double minTargetScriptFrac) {
         if (text.indexOf('\uFFFD') >= 0) {
             return null;
         }
@@ -586,17 +697,34 @@ public class BuildJunkTrainingData {
         }
         int cpCount = 0;
         int puncCount = 0;
+        int scriptCpTotal = 0;
+        int scriptCpMatching = 0;
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             cpCount++;
             if (cp >= 0x21 && cp <= 0x7E && !Character.isLetter(cp)) {
                 puncCount++;
             }
+            if (targetScript != null) {
+                Character.UnicodeScript s = Character.UnicodeScript.of(cp);
+                if (s != Character.UnicodeScript.COMMON
+                        && s != Character.UnicodeScript.INHERITED
+                        && s != Character.UnicodeScript.UNKNOWN) {
+                    scriptCpTotal++;
+                    if (s == targetScript) {
+                        scriptCpMatching++;
+                    }
+                }
+            }
             i += Character.charCount(cp);
         }
         if (cpCount > 0 && (double) puncCount / cpCount > maxPuncFrac) {
             return null;
         }
+        if (targetScript != null && scriptCpTotal > 0
+                && (double) scriptCpMatching / scriptCpTotal < 
minTargetScriptFrac) {
+            return null;
+        }
         return text;
     }
 
@@ -624,23 +752,33 @@ public class BuildJunkTrainingData {
 
     private static void printUsage() {
         System.err.println("Usage: BuildJunkTrainingData [options]");
-        System.err.println("  --data-dir              <path>  MADLAD data root"
+        System.err.println("  --data-dir               <path>  MADLAD data 
root"
                 + " (default: ~/datasets/madlad/data)");
-        System.err.println("  --output-dir            <path>  Output directory"
+        System.err.println("  --output-dir             <path>  Output 
directory"
                 + " (default: ~/datasets/madlad/junkdetect)");
-        System.err.println("  --script-sample-lines   N       Lines per 
language for script"
+        System.err.println("  --script-sample-lines    N       Lines per 
language for script"
                 + " detection (default: 2000)");
-        System.err.println("  --total-budget-bytes    N       Total UTF-8 
bytes across all"
+        System.err.println("  --total-budget-bytes     N       Total UTF-8 
bytes across all"
                 + " scripts (default: 50000000)");
-        System.err.println("  --min-bytes             N       Min UTF-8 bytes 
per sentence"
+        System.err.println("  --per-language-cap-bytes N       Max UTF-8 bytes 
contributed by any"
+                + " single language to its script bucket (default: 5000000).  
Prevents one large"
+                + " language source from dominating a multi-language bucket.");
+        System.err.println("  --min-bytes              N       Min UTF-8 bytes 
per sentence"
                 + " (default: 50)");
-        System.err.println("  --max-punc-frac         F       Max ASCII punct 
fraction"
+        System.err.println("  --max-punc-frac          F       Max ASCII punct 
fraction"
                 + " (default: 0.30)");
-        System.err.println("  --min-dev-sentences     N       Min sentences in 
dev split for a"
+        System.err.println("  --min-target-script-frac F       Min fraction of 
non-COMMON cps that"
+                + " must be in the bucket's target script for a sentence to be 
kept"
+                + " (default: 0.05).  Filters off-target Wikipedia stubs (e.g. 
English-about-Gothic"
+                + " articles in the GOTHIC bucket).");
+        System.err.println("  --min-dev-sentences      N       Min sentences 
in dev split for a"
                 + " script to be included (default: 500). Scripts below this 
floor"
                 + " have unreliable calibration and inflated FPR.");
-        System.err.println("  --seed                  N       Random seed 
(default: 42)");
-        System.err.println("  --dry-run                       Detect scripts + 
show budget,"
+        System.err.println("  --drop-scripts           S,S,..  Comma-separated 
script bucket names"
+                + " to exclude (e.g. GOTHIC,THAANA).  Use when source data is 
too thin or off-"
+                + " target for reliable distribution estimates.");
+        System.err.println("  --seed                   N       Random seed 
(default: 42)");
+        System.err.println("  --dry-run                        Detect scripts 
+ show budget,"
                 + " skip file writing");
     }
 }

Reply via email to