(tika) 04/06: step 4

tallison Fri, 17 Apr 2026 10:59:40 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch charset-ship-today
in repository https://gitbox.apache.org/repos/asf/tika.git


commit ae11a89458bb33fe77675560ae3200224b6bac25
Author: tallison <[email protected]>
AuthorDate: Fri Apr 17 09:16:41 2026 -0400

    step 4
---
 .../org/apache/tika/detect/AutoDetectReader.java   |   9 +-
 .../org/apache/tika/detect/CharsetSupersets.java   |  89 +++++
 .../apache/tika/metadata/TikaCoreProperties.java   |  12 +
 .../ml/chardetect/ByteNgramFeatureExtractor.java   | 186 +++------
 .../chardetect/tools/BuildCharsetTrainingData.java |  21 +-
 .../ConfigurableByteNgramFeatureExtractor.java     | 416 ---------------------
 .../ml/chardetect/tools/EvalCharsetDetectors.java  |   2 +-
 .../ml/chardetect/tools/TraceCharsetLogits.java    |  15 +-
 .../ml/chardetect/tools/TrainCharsetModel.java     | 125 +++----
 .../chardetect/ConfigurableGlobalFeatureTest.java  | 233 ------------
 .../ml/chardetect/FeatureExtractorParityTest.java  | 354 ------------------
 11 files changed, 235 insertions(+), 1227 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java 
b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index f306f69548..9e6c23297f 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -99,9 +99,14 @@ public class AutoDetectReader extends BufferedReader {
         // Ask all given detectors for the character encoding
         List<EncodingResult> results = detector.detect(tis, metadata, new 
ParseContext());
         if (!results.isEmpty()) {
-            return results.get(0).getCharset();
+            Charset detected = results.get(0).getCharset();
+            Charset superset = CharsetSupersets.supersetOf(detected);
+            if (superset != null) {
+                metadata.set(TikaCoreProperties.DECODED_CHARSET, 
superset.name());
+                return superset;
+            }
+            return detected;
         }
-        Charset charset = null;
 
         // Try determining the encoding based on hints in document metadata
         MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java 
b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
new file mode 100644
index 0000000000..f53c98f847
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Maps detected charsets to safer superset charsets for decoding.
+ *
+ * <p>When Tika detects a charset that is a strict subset of a broader 
encoding,
+ * it is safer to decode with the superset — the superset handles all byte
+ * sequences the subset can produce, plus the extension characters the subset
+ * cannot represent. Decoding with only the subset risks mojibake on any
+ * extension characters present in the document.</p>
+ *
+ * <p>Policy: Content-Type and detected-encoding metadata report the 
<em>detected</em>
+ * charset. Actual stream decoding uses the superset. The superset used is 
recorded
+ * in {@link org.apache.tika.metadata.TikaCoreProperties#DECODED_CHARSET}.</p>
+ *
+ * <h3>Superset map</h3>
+ * <ul>
+ *   <li>EUC-KR → x-windows-949 (MS949 is a strict superset: all EUC-KR byte 
sequences
+ *       decode identically, extension chars in x-windows-949 would mojibake 
under EUC-KR)</li>
+ *   <li>Big5 → Big5-HKSCS (HKSCS adds Hong Kong Supplementary Characters)</li>
+ *   <li>GB2312 → GB18030 (GB18030 is a strict superset of both GB2312 and 
GBK)</li>
+ *   <li>GBK → GB18030 (GB18030 is a strict superset; enables 4-byte extension 
sequences)</li>
+ *   <li>Shift_JIS → windows-31j (MS932 is a strict superset with NEC/IBM 
extensions)</li>
+ * </ul>
+ */
+public final class CharsetSupersets {
+
+    /**
+     * Maps detected charset canonical names (case-sensitive, as returned by
+     * {@link Charset#name()}) to their superset charset canonical name.
+     */
+    public static final Map<String, String> SUPERSET_MAP;
+
+    static {
+        Map<String, String> m = new HashMap<>();
+        m.put("EUC-KR",    "x-windows-949");
+        m.put("Big5",      "Big5-HKSCS");
+        m.put("GB2312",    "GB18030");
+        m.put("GBK",       "GB18030");
+        m.put("Shift_JIS", "windows-31j");
+        SUPERSET_MAP = Collections.unmodifiableMap(m);
+    }
+
+    private CharsetSupersets() {
+    }
+
+    /**
+     * Returns the superset charset to use for decoding, or {@code null} if
+     * {@code detected} has no superset override.
+     *
+     * @param detected the charset returned by the encoding detector
+     * @return superset charset, or {@code null} if none is defined
+     */
+    public static Charset supersetOf(Charset detected) {
+        if (detected == null) {
+            return null;
+        }
+        String supersetName = SUPERSET_MAP.get(detected.name());
+        if (supersetName == null) {
+            return null;
+        }
+        try {
+            return Charset.forName(supersetName);
+        } catch (IllegalArgumentException e) {
+            return null;
+        }
+    }
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java 
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 6d513a2a67..06e0ce4f2c 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -437,6 +437,18 @@ public interface TikaCoreProperties {
     Property ENCODING_DETECTION_TRACE =
             Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");
 
+    /**
+     * The charset actually used to decode the stream when a superset override 
was applied.
+     * When the detected encoding (reported in Content-Type and {@link 
#DETECTED_ENCODING}) is
+     * a subset of a safer, broader charset (e.g. EUC-KR is a subset of 
x-windows-949, or
+     * GB2312 is a subset of GB18030), Tika decodes using the superset charset 
to avoid
+     * mojibake on extension characters. This field records the superset 
charset name so
+     * callers know which codec was actually used. Absent when detection and 
decoding use
+     * the same charset.
+     */
+    Property DECODED_CHARSET =
+            Property.externalText(TIKA_META_PREFIX + "decodedCharset");
+
     /**
      * General metadata key for the count of non-final versions available 
within a file.  This
      * was added initially to support generalizing incremental updates in PDF.
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
index baa67fbc47..fbdac3199d 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
@@ -22,30 +22,15 @@ import org.apache.tika.ml.FeatureExtractor;
  * Feature extractor for raw bytes for charset detection, using FNV-1a hashing
  * into a fixed-width bucket array.
  *
- * <h3>Feature set (fixed — UB-AS)</h3>
- * <p>This production extractor uses the feature set selected by grid search 
over
- * the MadLAD-derived {@code charset-detect3} corpus (34 charsets, 3 runs × 6
- * configs × 3 bucket sizes, devtest accuracy averaged to reduce SGD noise):
- * <strong>unigrams + bigrams + anchored bigrams + stride-2 bigrams</strong>
- * (UB-AS), 16384 buckets.</p>
+ * <h3>Feature set (fixed — UB-A)</h3>
+ * <p>This production extractor emits <strong>high-byte-anchored unigrams,
+ * bigrams, and anchored bigrams</strong> plus a single ASCII-density global
+ * feature.  The total feature-vector dimension is {@link #NUM_BUCKETS}.</p>
  *
- * <p>Key findings from the ablation/grid search:</p>
- * <ul>
- *   <li>Trigrams (T) added no accuracy over UB-AS and were dropped.</li>
- *   <li>Stride-2 bigrams (S) are the single most important new feature —
- *       they lifted overall accuracy from ~73% (old UBT- model without 
UTF-16/32
- *       training) to ~95% by giving the model direct code-unit visibility into
- *       UTF-16/32 structure.</li>
- *   <li>Anchored bigrams (A) add ~0.04% at 16384 buckets — tiny but 
consistent.</li>
- *   <li>Accuracy plateau between 8192 and 32768 buckets is within SGD noise;
- *       16384 chosen as the best size/accuracy trade-off.</li>
- * </ul>
- *
- * <p>The feature flags are intentionally not configurable here — the shipped 
model
+ * <p>The feature flags are intentionally not configurable — the shipped model
  * was trained with exactly this configuration, and using any other combination
- * at inference time would produce silently wrong predictions.
- * For training new models with different feature combinations, use
- * {@code ConfigurableByteNgramFeatureExtractor} in the training-tools 
module.</p>
+ * at inference time would produce silently wrong predictions.  Design choices
+ * are tracked in git rather than at the command line.</p>
  *
  * <h3>Features emitted</h3>
  * <ul>
@@ -64,25 +49,32 @@ import org.apache.tika.ml.FeatureExtractor;
  *       cross-character boundary structure in Shift-JIS and Big5 where trail
  *       bytes fall below 0x80 (0x40–0x7E). A distinct salt ({@code 
FNV_ANCHOR_SALT})
  *       prevents hash collisions with stride-1 bigrams.</li>
- *   <li><strong>Stride-2 bigrams</strong>: pairs {@code (b[i], b[i+1])} 
sampled
- *       at even positions {@code i = 0, 2, 4, ...}, covering all bytes (not 
just
- *       high bytes). These pairs directly reflect code-unit structure: 
UTF-16LE
- *       BMP text produces many {@code (XX, 0x00)} pairs; UTF-16BE produces
- *       {@code (0x00, XX)}. A distinct FNV salt ({@code FNV_STRIDE2_SALT})
- *       prevents hash collisions with stride-1 features. The BOM must be
- *       stripped upstream before bytes reach this extractor so that offset 0
- *       always aligns with a real code unit, matching the BOM-free training
- *       data.</li>
+ *   <li><strong>ASCII-density global</strong>: exactly one of
+ *       {@link #GLOBAL_FEATURE_COUNT} bins fires per probe, based on the
+ *       fraction of bytes that are printable ASCII (see
+ *       {@link #asciiDensityBin(byte[])}).  Helps the model condition its
+ *       Western-European vs CJK vs EBCDIC decision on overall probe 
shape.</li>
  * </ul>
  *
- * <h3>Why the high-byte filter matters for stride-1 features</h3>
+ * <h3>UTF-16 detection is owned by the UTF-16 specialist</h3>
+ * <p>Stride-2 bigrams previously emitted here were the model's primary UTF-16
+ * signal.  They are no longer emitted: UTF-16 detection is now handled by
+ * {@code Utf16SpecialistEncodingDetector}, which uses column-aggregate byte-
+ * range features.  That specialist correctly handles Latin, Cyrillic, Arabic,
+ * Hebrew, Indic, Thai, CJK Unified, and Hangul UTF-16 alike — including the
+ * CJK UTF-16 cases that a printable-ASCII-filtered stride-2 would have
+ * missed (the bytes of common Chinese U+4E00–U+7EFF and hiragana U+3040–U+309F
+ * code units frequently fall in {@code [0x20, 0x7E]}).  Native multi-byte CJK
+ * (Shift_JIS / GB18030 / Big5 / EUC-*) is still discriminated here via
+ * high-byte-anchored bigrams — all CJK lead bytes are {@code >= 0x81}.</p>
+ *
+ * <h3>Why the high-byte filter matters</h3>
  * <p>Training data is clean text (no HTML tags). Inference data is often raw
  * HTML (many ASCII tag bytes). Without the filter, the model would see a
  * different byte distribution at inference time than at training time. By
  * ignoring bytes below 0x80 entirely for stride-1 features, HTML tags are
  * invisible to both the training and inference feature computation — no
- * stripping needed. Stride-2 features intentionally include all bytes because
- * the low bytes are the signal (e.g. the 0x00 high byte in UTF-16 BMP 
text).</p>
+ * stripping needed.</p>
  */
 public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> {
 
@@ -90,84 +82,37 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
     private static final int FNV_OFFSET       = 0x811c9dc5;
     /** Distinct salt for anchored bigrams (high→low boundary) — prevents 
collision with stride-1. */
     private static final int FNV_ANCHOR_SALT  = 0x27d4eb2f;
-    /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 
hashes. */
-    private static final int FNV_STRIDE2_SALT = 0x9e3779b9;
+
+    /** Feature-vector dimension of the shipped model: 16384 hash buckets + 6 global slots. */
+    public static final int NUM_BUCKETS = 16390;
 
     /**
-     * Number of reserved slots at the high end of the feature vector used for
-     * global (whole-probe) features when {@link #useGlobalFeatures} is 
enabled.
-     * Currently 6 slots hold ASCII-text-density bins (see
-     * {@link #asciiDensityBin(byte[])}).  Must match the training-side
-     * {@code ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT}.
+     * Number of reserved slots at the high end of the feature vector for
+     * global (whole-probe) features. The last 6 slots hold ASCII-text-density
+     * bins (see {@link #asciiDensityBin(byte[])}). Always active.
      */
     public static final int GLOBAL_FEATURE_COUNT = 6;
 
     private final int numBuckets;
-    private final int stride1Buckets;
-    private final int stride2Buckets;
-    private final int stride2Base;
-    private final int globalBase;
-    private final boolean useGlobalFeatures;
-    private final boolean useSplitSpaces;
+    private final int hashSpace;   // numBuckets - GLOBAL_FEATURE_COUNT
+    private final int globalBase;  // = hashSpace (first of 6 global slots)
 
     /**
-     * Legacy constructor: no globals, shared stride-1/stride-2 hash space.
-     * Matches the layout used by the shipped {@code 
chardetect-v6-no-utf32.bin}.
-     *
-     * @param numBuckets number of hash buckets (feature-vector dimension)
+     * @param numBuckets total feature-vector dimension, including the
+     *                   {@link #GLOBAL_FEATURE_COUNT} global slots at the end.
      */
     public ByteNgramFeatureExtractor(int numBuckets) {
-        this(numBuckets, false, false);
-    }
-
-    /**
-     * Create an extractor matching the layout of a trained model.
-     *
-     * @param numBuckets         total feature-vector dimension.
-     * @param useGlobalFeatures  reserve the last {@link #GLOBAL_FEATURE_COUNT}
-     *                           slots for ASCII-density bin features.
-     * @param useSplitSpaces     split the hash space 50/50 between stride-1
-     *                           features (low half) and stride-2 features
-     *                           (high half) so cross-family hash collisions
-     *                           cannot pollute single-byte-charset weights
-     *                           with stride-2 signals.
-     */
-    public ByteNgramFeatureExtractor(int numBuckets,
-                                     boolean useGlobalFeatures,
-                                     boolean useSplitSpaces) {
-        if (numBuckets <= 0) {
-            throw new IllegalArgumentException("numBuckets must be positive: " 
+ numBuckets);
-        }
-        int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0;
-        int hashSpace = numBuckets - globalsReserved;
-        if (hashSpace <= 0) {
+        if (numBuckets <= GLOBAL_FEATURE_COUNT) {
             throw new IllegalArgumentException(
-                    "numBuckets must exceed GLOBAL_FEATURE_COUNT when 
useGlobalFeatures=true: "
-                            + numBuckets);
-        }
-        if (useSplitSpaces && hashSpace < 2) {
-            throw new IllegalArgumentException(
-                    "useSplitSpaces requires hashSpace >= 2: " + hashSpace);
+                    "numBuckets must exceed GLOBAL_FEATURE_COUNT: " + 
numBuckets);
         }
         this.numBuckets = numBuckets;
-        this.useSplitSpaces = useSplitSpaces;
-        this.useGlobalFeatures = useGlobalFeatures;
-        if (useSplitSpaces) {
-            this.stride1Buckets = hashSpace / 2;
-            this.stride2Buckets = hashSpace - this.stride1Buckets;
-            this.stride2Base = this.stride1Buckets;
-        } else {
-            this.stride1Buckets = hashSpace;
-            this.stride2Buckets = hashSpace;
-            this.stride2Base = 0;
-        }
+        this.hashSpace  = numBuckets - GLOBAL_FEATURE_COUNT;
         this.globalBase = hashSpace;
     }
 
     /**
      * Returns which ASCII-text-density bin this probe falls into, in [0, 6).
-     * Must match the training-side
-     * {@code ConfigurableByteNgramFeatureExtractor.asciiDensityBin}.
      *
      * <p>Bin layout (fraction of bytes that are ASCII-text: printable
      * {@code 0x20..0x7E} plus {@code 0x09 0x0A 0x0D}):</p>
@@ -285,28 +230,12 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
             }
         }
 
-        // Stride-2: code-unit pairs at positions 0, 2, 4, ...
-        // Covers all bytes (not just high bytes) so UTF-16 null bytes are 
visible.
-        for (int i = 0; i + 1 < input.length; i += 2) {
-            int b0 = input[i] & 0xFF;
-            int b1 = input[i + 1] & 0xFF;
-            int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
-            h = (h ^ b1) * FNV_PRIME;
-            int bkt = stride2Bucket(h);
-            if (dense[bkt] == 0) {
-                touched[n++] = bkt;
-            }
-            dense[bkt]++;
-        }
-
-        // Global features: fire exactly one ASCII-density bin.
-        if (useGlobalFeatures) {
-            int bkt = globalBase + asciiDensityBin(input);
-            if (dense[bkt] == 0) {
-                touched[n++] = bkt;
-            }
-            dense[bkt]++;
+        // Global feature: fire exactly one ASCII-density bin.
+        int bkt = globalBase + asciiDensityBin(input);
+        if (dense[bkt] == 0) {
+            touched[n++] = bkt;
         }
+        dense[bkt]++;
 
         return n;
     }
@@ -332,29 +261,14 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
             }
         }
 
-        // Stride-2 bigrams (same logic as extractSparseInto).
-        for (int i = from; i + 1 < to; i += 2) {
-            int b0 = b[i] & 0xFF;
-            int b1 = b[i + 1] & 0xFF;
-            int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
-            h = (h ^ b1) * FNV_PRIME;
-            counts[stride2Bucket(h)]++;
-        }
-
-        // Global features: fire exactly one ASCII-density bin.
-        if (useGlobalFeatures) {
-            byte[] slice = (from == 0 && to == b.length)
-                    ? b : java.util.Arrays.copyOfRange(b, from, to);
-            counts[globalBase + asciiDensityBin(slice)]++;
-        }
+        // Global feature: fire exactly one ASCII-density bin.
+        byte[] slice = (from == 0 && to == b.length)
+                ? b : java.util.Arrays.copyOfRange(b, from, to);
+        counts[globalBase + asciiDensityBin(slice)]++;
     }
 
     private int stride1Bucket(int hash) {
-        return (hash & 0x7fffffff) % stride1Buckets;
-    }
-
-    private int stride2Bucket(int hash) {
-        return stride2Base + (hash & 0x7fffffff) % stride2Buckets;
+        return (hash & 0x7fffffff) % hashSpace;
     }
 
     @Override
@@ -382,6 +296,6 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
     @Override
     public String toString() {
         return String.format(java.util.Locale.ROOT,
-                "ByteNgramFeatureExtractor{buckets=%d, UB-AS}", numBuckets);
+                "ByteNgramFeatureExtractor{buckets=%d, UB-A}", numBuckets);
     }
 }
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
index 07e5b524e5..afd5fb4b30 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
@@ -119,6 +119,7 @@ public class BuildCharsetTrainingData {
         CHARSET_JAVA.put("Shift_JIS",      "Shift_JIS");
         CHARSET_JAVA.put("EUC-JP",         "EUC-JP");
         CHARSET_JAVA.put("EUC-KR",         "EUC-KR");
+        CHARSET_JAVA.put("x-windows-949",  "x-windows-949");
         CHARSET_JAVA.put("GB18030",        "GB18030");
         CHARSET_JAVA.put("Big5-HKSCS",     "Big5-HKSCS");
         CHARSET_JAVA.put("x-EUC-TW",      "x-EUC-TW");
@@ -153,7 +154,15 @@ public class BuildCharsetTrainingData {
         CHARSET_JAVA.put("IBM852",         "IBM852");
         // Mac Roman
         CHARSET_JAVA.put("x-MacRoman",     "x-MacRoman");
-        // EBCDIC
+        // EBCDIC — all variants are generated into the training corpus so a 
future
+        // EBCDIC specialist can be trained against them.  Today's main SBCS 
model
+        // consumes only a subset of these (see TrainCharsetModel's hardcoded
+        // exclusion list): IBM424 (Hebrew) and IBM420 (Arabic) live entirely 
in
+        // the 0x41–0x6A range, below the 0x80 threshold our feature extractor
+        // considers, so excluding them from today's model avoids training on a
+        // signal the inference path cannot see; IBM1047 is byte-identical to
+        // IBM500 on most prose bytes and is excluded to avoid near-duplicate
+        // classes in the SBCS kitchen-sink model.
         CHARSET_JAVA.put("IBM500",         "IBM500");
         CHARSET_JAVA.put("IBM1047",        "IBM1047");
         CHARSET_JAVA.put("IBM424-ltr",     "IBM424");
@@ -237,8 +246,11 @@ public class BuildCharsetTrainingData {
         put("jpn", "Shift_JIS", "EUC-JP", "ISO-2022-JP");
         // Chinese (Simplified)
         put("zho", "GB18030", "ISO-2022-CN");
-        // Korean
-        put("kor", "EUC-KR", "ISO-2022-KR");
+        // Korean — x-windows-949 (MS949) is a strict superset of EUC-KR.
+        // Trained as a separate class so the model can discriminate MS949-
+        // extension-byte content from pure-EUC-KR content.  At decode time,
+        // CharsetSupersets maps a detected EUC-KR result to MS949 anyway.
+        put("kor", "EUC-KR", "ISO-2022-KR", "x-windows-949");
         // Thai
         put("tha", "windows-874");
         // Traditional Chinese — sourced from Cantonese Wikipedia (yue)
@@ -306,7 +318,8 @@ public class BuildCharsetTrainingData {
      * ASCII-range characters.
      */
     private static final Set<String> HIGH_BYTE_CJK = new 
HashSet<>(Arrays.asList(
-            "Shift_JIS", "EUC-JP", "EUC-KR", "GB18030", "Big5-HKSCS", 
"x-EUC-TW"
+            "Shift_JIS", "EUC-JP", "EUC-KR", "x-windows-949",
+            "GB18030", "Big5-HKSCS", "x-EUC-TW"
     ));
 
     /** RTL charsets: text is reversed (character level) before encoding. */
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
deleted file mode 100644
index 88469abab9..0000000000
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect.tools;
-
-import org.apache.tika.ml.FeatureExtractor;
-
-/**
- * Configurable byte n-gram feature extractor for use during training and
- * ablation studies.
- *
- * <p>This class exposes all hyperparameters ({@code numBuckets}, feature 
flags)
- * as constructor arguments so that training tools and annealing scripts can
- * explore the full search space.  It is intentionally kept out of the
- * production {@code tika-encoding-detector-mojibuster} module — the shipped
- * model was trained with fixed parameters (UBT-: unigrams + bigrams + 
trigrams,
- * no anchored bigrams, 8192 buckets) which are hard-coded in the production
- * {@link org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor}.</p>
- *
- * <p>Using this class at inference time against a model trained with different
- * flags would produce silently wrong predictions.</p>
- *
- * <h3>Feature flags</h3>
- * <ul>
- *   <li><b>useUnigrams</b>: emit one feature per high byte ({@code >= 
0x80})</li>
- *   <li><b>useBigrams</b>: emit one feature per (high, next) byte pair</li>
- *   <li><b>useTrigrams</b>: emit one feature per (high, next, next+1) 
triple</li>
- *   <li><b>useAnchoredBigrams</b>: emit one feature per (low-trail, next) pair
- *       when the trail byte is {@code < 0x80} — captures cross-character
- *       boundaries in encodings like Shift-JIS and Big5 with low trail 
bytes</li>
- *   <li><b>useStride2Bigrams</b>: emit one feature per (b[i], b[i+1]) pair at
- *       even positions i = 0, 2, 4, ... covering all bytes (not just high 
bytes).
- *       A distinct FNV salt prevents hash collision with stride-1 bigrams.
- *       Helps the model distinguish UTF-16BE/LE via their characteristic
- *       code-unit patterns.</li>
- * </ul>
- */
-public class ConfigurableByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
-
-    private static final int FNV_PRIME        = 0x01000193;
-    private static final int FNV_OFFSET       = 0x811c9dc5;
-    private static final int FNV_ANCHOR_SALT  = 0x27d4eb2f;
-    /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 
hashes. */
-    private static final int FNV_STRIDE2_SALT = 0x9e3779b9;
-
-    /**
-     * Number of reserved slots at the high end of the feature vector used for
-     * global (whole-probe) features when {@link #useGlobalFeatures} is 
enabled.
-     * Currently 6 slots hold ASCII-low-byte density bins (see
-     * {@link #asciiDensityBin(byte[])}).
-     */
-    public static final int GLOBAL_FEATURE_COUNT = 6;
-
-    private final int numBuckets;
-    private final int stride1Buckets;    // size of the stride-1 hash region
-    private final int stride2Buckets;    // size of the stride-2 hash region
-    private final int stride2Base;       // first slot of the stride-2 region
-    private final int globalBase;        // first slot of the globals region 
(or numBuckets if disabled)
-    private final boolean useUnigrams;
-    private final boolean useBigrams;
-    private final boolean useTrigrams;
-    private final boolean useAnchoredBigrams;
-    private final boolean useStride2Bigrams;
-    private final boolean useGlobalFeatures;
-    private final boolean useSplitSpaces;
-
-    /**
-     * Backwards-compatible constructor (no global features, no split spaces).
-     */
-    public ConfigurableByteNgramFeatureExtractor(int numBuckets,
-                                                 boolean useUnigrams,
-                                                 boolean useBigrams,
-                                                 boolean useTrigrams,
-                                                 boolean useAnchoredBigrams,
-                                                 boolean useStride2Bigrams) {
-        this(numBuckets, useUnigrams, useBigrams, useTrigrams,
-                useAnchoredBigrams, useStride2Bigrams, false);
-    }
-
-    /**
-     * Constructor with globals support, shared hash space (stride-1 and 
stride-2
-     * mod into the same bucket range).
-     */
-    public ConfigurableByteNgramFeatureExtractor(int numBuckets,
-                                                 boolean useUnigrams,
-                                                 boolean useBigrams,
-                                                 boolean useTrigrams,
-                                                 boolean useAnchoredBigrams,
-                                                 boolean useStride2Bigrams,
-                                                 boolean useGlobalFeatures) {
-        this(numBuckets, useUnigrams, useBigrams, useTrigrams,
-                useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures, 
false);
-    }
-
-    /**
-     * @param numBuckets         total feature-vector dimension.  When
-     *                           {@code useGlobalFeatures} is {@code true}, the
-     *                           last {@link #GLOBAL_FEATURE_COUNT} slots are
-     *                           reserved for global features.  When
-     *                           {@code useSplitSpaces} is {@code true}, the
-     *                           remaining hash space is split 50/50 between
-     *                           stride-1 features and stride-2 features so
-     *                           HTML-shaped stride-2 emissions cannot collide
-     *                           with single-byte-charset stride-1 weights.
-     * @param useUnigrams        emit unigram for each high byte
-     * @param useBigrams         emit bigram anchored on each high byte
-     * @param useTrigrams        emit trigram anchored on each high byte
-     * @param useAnchoredBigrams emit bigram anchored on each low trail byte
-     * @param useStride2Bigrams  emit stride-2 bigrams at even positions (all 
bytes)
-     * @param useGlobalFeatures  emit whole-probe global features into the
-     *                           reserved tail slots (ASCII-density bins)
-     * @param useSplitSpaces     give stride-1 and stride-2 features disjoint
-     *                           bucket ranges
-     */
-    public ConfigurableByteNgramFeatureExtractor(int numBuckets,
-                                                 boolean useUnigrams,
-                                                 boolean useBigrams,
-                                                 boolean useTrigrams,
-                                                 boolean useAnchoredBigrams,
-                                                 boolean useStride2Bigrams,
-                                                 boolean useGlobalFeatures,
-                                                 boolean useSplitSpaces) {
-        if (numBuckets <= 0) {
-            throw new IllegalArgumentException("numBuckets must be positive: " 
+ numBuckets);
-        }
-        int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0;
-        int hashSpace = numBuckets - globalsReserved;
-        if (hashSpace <= 0) {
-            throw new IllegalArgumentException(
-                    "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + 
GLOBAL_FEATURE_COUNT
-                            + ") when useGlobalFeatures=true: " + numBuckets);
-        }
-        if (useSplitSpaces && hashSpace < 2) {
-            throw new IllegalArgumentException(
-                    "useSplitSpaces requires hashSpace >= 2: " + hashSpace);
-        }
-        this.numBuckets = numBuckets;
-        this.useSplitSpaces = useSplitSpaces;
-        if (useSplitSpaces) {
-            // 50/50 split; stride-1 gets the first half, stride-2 gets the 
second.
-            this.stride1Buckets = hashSpace / 2;
-            this.stride2Buckets = hashSpace - this.stride1Buckets;
-            this.stride2Base = this.stride1Buckets;
-        } else {
-            // Both stride families share the same hash region [0, hashSpace).
-            this.stride1Buckets = hashSpace;
-            this.stride2Buckets = hashSpace;
-            this.stride2Base = 0;
-        }
-        // Globals region always starts immediately after the hash region(s).
-        this.globalBase = hashSpace;
-        this.useUnigrams = useUnigrams;
-        this.useBigrams = useBigrams;
-        this.useTrigrams = useTrigrams;
-        this.useAnchoredBigrams = useAnchoredBigrams;
-        this.useStride2Bigrams = useStride2Bigrams;
-        this.useGlobalFeatures = useGlobalFeatures;
-    }
-
-    /**
-     * Returns which ASCII-text-density bin this probe falls into, in [0, 6).
-     *
-     * <p>Counts only <em>ASCII text bytes</em> — printable (0x20..0x7E) plus
-     * common whitespace (0x09 tab, 0x0A LF, 0x0D CR).  NUL and other control
-     * bytes do <em>not</em> count.  This matters because UTF-16LE/BE probes
-     * contain ~50% 0x00 bytes; if we counted those as "low", UTF-16 English
-     * would look like sparse Latin to the model, defeating the point of the
-     * feature.  With the current definition, real UTF-16 English lands around
-     * bin 2-3 (half ASCII-letter bytes, half nulls), distinguishable from
-     * plain-ASCII probes (bin 5) and from real EBCDIC (bin 0-1).</p>
-     *
-     * <p>Bin layout (fraction of bytes that are ASCII-text):</p>
-     * <ul>
-     *   <li>0: [0.00, 0.10) — effectively no ASCII text (real EBCDIC 
letters)</li>
-     *   <li>1: [0.10, 0.50) — heavy non-ASCII content (CJK text, UTF-16 
mixed)</li>
-     *   <li>2: [0.50, 0.80) — text with dense foreign script, UTF-16 
Latin</li>
-     *   <li>3: [0.80, 0.95) — normal foreign-script text with ASCII 
markup</li>
-     *   <li>4: [0.95, 0.99) — sparse-diacritic Western text</li>
-     *   <li>5: [0.99, 1.00] — near-pure ASCII (vCards, config, scripts)</li>
-     * </ul>
-     */
-    public static int asciiDensityBin(byte[] input) {
-        if (input == null || input.length == 0) {
-            return 5;
-        }
-        int asciiText = 0;
-        for (byte b : input) {
-            int v = b & 0xFF;
-            if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 
0x0D) {
-                asciiText++;
-            }
-        }
-        double p = (double) asciiText / input.length;
-        if (p < 0.10) {
-            return 0;
-        }
-        if (p < 0.50) {
-            return 1;
-        }
-        if (p < 0.80) {
-            return 2;
-        }
-        if (p < 0.95) {
-            return 3;
-        }
-        if (p < 0.99) {
-            return 4;
-        }
-        return 5;
-    }
-
-    @Override
-    public int[] extract(byte[] input) {
-        int[] counts = new int[numBuckets];
-        if (input == null || input.length == 0) {
-            return counts;
-        }
-        extractInto(input, 0, input.length, counts);
-        return counts;
-    }
-
-    /**
-     * Sparse extraction into caller-owned, reusable buffers.  O(probe length).
-     *
-     * @param input   raw bytes
-     * @param dense   scratch buffer of length {@code numBuckets}, all-zeros 
on entry
-     * @param touched receives indices of non-zero buckets
-     * @return number of active entries written into {@code touched}
-     */
-    public int extractSparseInto(byte[] input, int[] dense, int[] touched) {
-        if (input == null || input.length == 0) {
-            return 0;
-        }
-        int n = 0;
-
-        // Stride-1: high-byte-anchored features.
-        for (int i = 0; i < input.length; i++) {
-            int bi = input[i] & 0xFF;
-            if (bi < 0x80) {
-                continue;
-            }
-
-            if (useUnigrams) {
-                int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                int bkt = stride1Bucket(h);
-                if (dense[bkt] == 0) {
-                    touched[n++] = bkt;
-                }
-                dense[bkt]++;
-            }
-
-            if (i + 1 < input.length) {
-                int bi1 = input[i + 1] & 0xFF;
-
-                if (useBigrams) {
-                    int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                    h = (h ^ bi1) * FNV_PRIME;
-                    int bkt = stride1Bucket(h);
-                    if (dense[bkt] == 0) {
-                        touched[n++] = bkt;
-                    }
-                    dense[bkt]++;
-                }
-
-                if (useAnchoredBigrams && bi1 < 0x80) {
-                    int h = (FNV_ANCHOR_SALT ^ bi1) * FNV_PRIME;
-                    if (i + 2 < input.length) {
-                        h = (h ^ (input[i + 2] & 0xFF)) * FNV_PRIME;
-                    }
-                    int bkt = stride1Bucket(h);
-                    if (dense[bkt] == 0) {
-                        touched[n++] = bkt;
-                    }
-                    dense[bkt]++;
-                }
-
-                if (useTrigrams && i + 2 < input.length) {
-                    int bi2 = input[i + 2] & 0xFF;
-                    int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                    h = (h ^ bi1) * FNV_PRIME;
-                    h = (h ^ bi2) * FNV_PRIME;
-                    int bkt = stride1Bucket(h);
-                    if (dense[bkt] == 0) {
-                        touched[n++] = bkt;
-                    }
-                    dense[bkt]++;
-                }
-            }
-        }
-
-        // Stride-2: code-unit pairs at positions 0, 2, 4, ...
-        if (useStride2Bigrams) {
-            for (int i = 0; i + 1 < input.length; i += 2) {
-                int b0 = input[i] & 0xFF;
-                int b1 = input[i + 1] & 0xFF;
-                int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
-                h = (h ^ b1) * FNV_PRIME;
-                int bkt = stride2Bucket(h);
-                if (dense[bkt] == 0) {
-                    touched[n++] = bkt;
-                }
-                dense[bkt]++;
-            }
-        }
-
-        // Global features at reserved tail slots: fire exactly one 
ASCII-density bin.
-        if (useGlobalFeatures) {
-            int bkt = globalBase + asciiDensityBin(input);
-            if (dense[bkt] == 0) {
-                touched[n++] = bkt;
-            }
-            dense[bkt]++;
-        }
-
-        return n;
-    }
-
-    private void extractInto(byte[] b, int from, int to, int[] counts) {
-        // Stride-1: high-byte-anchored features.
-        for (int i = from; i < to; i++) {
-            int bi = b[i] & 0xFF;
-            if (bi < 0x80) {
-                continue;
-            }
-
-            if (useUnigrams) {
-                counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++;
-            }
-
-            if (i + 1 < to) {
-                int bi1 = b[i + 1] & 0xFF;
-
-                if (useBigrams) {
-                    int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                    h = (h ^ bi1) * FNV_PRIME;
-                    counts[stride1Bucket(h)]++;
-                }
-
-                if (useAnchoredBigrams && bi1 < 0x80) {
-                    int h = (FNV_ANCHOR_SALT ^ bi1) * FNV_PRIME;
-                    if (i + 2 < to) {
-                        h = (h ^ (b[i + 2] & 0xFF)) * FNV_PRIME;
-                    }
-                    counts[stride1Bucket(h)]++;
-                }
-
-                if (useTrigrams && i + 2 < to) {
-                    int bi2 = b[i + 2] & 0xFF;
-                    int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                    h = (h ^ bi1) * FNV_PRIME;
-                    h = (h ^ bi2) * FNV_PRIME;
-                    counts[stride1Bucket(h)]++;
-                }
-            }
-        }
-
-        // Stride-2 bigrams (same logic as extractSparseInto).
-        if (useStride2Bigrams) {
-            for (int i = from; i + 1 < to; i += 2) {
-                int b0 = b[i] & 0xFF;
-                int b1 = b[i + 1] & 0xFF;
-                int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
-                h = (h ^ b1) * FNV_PRIME;
-                counts[stride2Bucket(h)]++;
-            }
-        }
-
-        // Global features at reserved tail slots: fire exactly one 
ASCII-density bin.
-        if (useGlobalFeatures) {
-            byte[] slice = (from == 0 && to == b.length)
-                    ? b : java.util.Arrays.copyOfRange(b, from, to);
-            counts[globalBase + asciiDensityBin(slice)]++;
-        }
-    }
-
-    private int stride1Bucket(int hash) {
-        return (hash & 0x7fffffff) % stride1Buckets;
-    }
-
-    private int stride2Bucket(int hash) {
-        return stride2Base + (hash & 0x7fffffff) % stride2Buckets;
-    }
-
-    @Override
-    public int getNumBuckets() {
-        return numBuckets;
-    }
-
-    public boolean isUseSplitSpaces() {
-        return useSplitSpaces;
-    }
-
-    @Override
-    public String toString() {
-        return String.format(java.util.Locale.ROOT,
-                "ConfigurableByteNgramFeatureExtractor{buckets=%d, 
stride1=[0,%d) stride2=[%d,%d) globals=[%d,%d)"
-                        + " uni=%b, bi=%b, tri=%b, anchored=%b, stride2f=%b, 
globalsf=%b, split=%b}",
-                numBuckets, stride1Buckets, stride2Base, stride2Base + 
stride2Buckets,
-                globalBase, numBuckets,
-                useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams,
-                useStride2Bigrams, useGlobalFeatures, useSplitSpaces);
-    }
-}
diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
index 5ca57b1669..eca49bc1c9 100644
--- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
+++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
@@ -73,7 +73,7 @@ public class EvalCharsetDetectors {
     private static final double OOV_THRESHOLD_CJK  = 0.80;
     private static final double OOV_THRESHOLD_SBCS = 0.98;
     private static final Set<String> CJK_CHARSETS = Set.of(
-            "Big5", "Big5-HKSCS", "EUC-JP", "EUC-KR", "EUC-TW",
+            "Big5", "Big5-HKSCS", "EUC-JP", "EUC-KR", "EUC-TW", "x-windows-949",
             "GB18030", "GB2312", "GBK", "Shift_JIS"
     );
     private static final Set<String> OOV_EXEMPT = Set.of(
diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
index dfe13b3ade..4a749ad124 100644
--- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
+++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
@@ -53,8 +53,6 @@ public final class TraceCharsetLogits {
         List<String> focus = new ArrayList<>();
         int topBuckets = 20;
         int maxProbeBytes = 32 * 1024;
-        boolean noStride2 = false;
-
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
                 case "--probe":
@@ -74,9 +72,6 @@ public final class TraceCharsetLogits {
                 case "--max-probe-bytes":
                     maxProbeBytes = Integer.parseInt(args[++i]);
                     break;
-                case "--no-stride2":
-                    noStride2 = true;
-                    break;
                 default:
                     System.err.println("Unknown arg: " + args[i]);
                     System.exit(1);
@@ -89,15 +84,7 @@ public final class TraceCharsetLogits {
         }
 
         LinearModel model = loadModel(modelPath);
-        FeatureExtractor<byte[]> extractor = noStride2
-                // Production flags minus stride-2, matching FeatureExtractorParityTest
-                // for the stride-1 features (uni + bi, no trigrams, no anchored).
-                ? new ConfigurableByteNgramFeatureExtractor(model.getNumBuckets(),
-                        true, true, false, false, false)
-                : new ByteNgramFeatureExtractor(model.getNumBuckets());
-        if (noStride2) {
-            System.out.println("Stride-2 features suppressed for this run.");
-        }
+        FeatureExtractor<byte[]> extractor = new ByteNgramFeatureExtractor(model.getNumBuckets());
 
         byte[] allBytes = Files.readAllBytes(probePath);
         byte[] probe = allBytes.length <= maxProbeBytes
diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
index 1e7a7e5cdf..d7379f4c8b 100644
--- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
+++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
@@ -35,8 +35,8 @@ import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.zip.GZIPInputStream;
 
-import org.apache.tika.ml.FeatureExtractor;
 import org.apache.tika.ml.LinearModel;
+import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor;
 import org.apache.tika.ml.chardetect.CharsetConfusables;
 
 /**
@@ -64,11 +64,43 @@ import org.apache.tika.ml.chardetect.CharsetConfusables;
  */
 public class TrainCharsetModel {
 
-    private static final int DEFAULT_NUM_BUCKETS = 16384;
+    private static final int DEFAULT_NUM_BUCKETS = ByteNgramFeatureExtractor.NUM_BUCKETS;
     private static final int DEFAULT_EPOCHS = 3;
     private static final float DEFAULT_LR = 0.05f;
     private static final int DEFAULT_MAX_SAMPLES = 500_000;
 
+    /**
+     * Labels excluded from the main SBCS "kitchen-sink" model by default.
+     *
+     * <p>Hardcoded here (rather than passed on the command line) so the model's
+     * class set is versioned in git alongside the code that uses it — past
+     * retraining runs with inconsistent CLI flags were a recurring source of
+     * mismatched inference/training feature sets.</p>
+     *
+     * <p>{@link BuildCharsetTrainingData} still generates training corpora for
+     * these labels — they are needed by future specialists (e.g. an EBCDIC
+     * specialist) — but the main SBCS model doesn't consume them today:</p>
+     * <ul>
+     *   <li><b>IBM424-ltr/rtl</b> (Hebrew EBCDIC) — content bytes occupy 0x41–0x6A,
+     *       entirely below the 0x80 threshold the shipped
+     *       {@link ByteNgramFeatureExtractor} considers.  Training on these
+     *       labels teaches weights the inference path cannot ever match.</li>
+     *   <li><b>IBM420-ltr/rtl</b> (Arabic EBCDIC) — same reason as IBM424.</li>
+     *   <li><b>IBM1047</b> (z/OS Unix System Services Latin-1) — byte-identical
+     *       to IBM500 on most prose; having both as classes just splits the
+     *       EBCDIC-Latin signal without adding discrimination the model can
+     *       use.</li>
+     * </ul>
+     *
+     * <p>CLI {@code --exclude} is unioned with this set, not replaced, so an
+     * operator can add further exclusions but cannot accidentally suppress
+     * the hardcoded policy.</p>
+     */
+    static final Set<String> TODAY_SBCS_EXCLUDE = Set.of(
+            "IBM424-ltr", "IBM424-rtl",
+            "IBM420-ltr", "IBM420-rtl",
+            "IBM1047");
+
     public static void main(String[] args) throws IOException {
         Path dataDir = null;
         Path outputPath = Paths.get("chardetect.bin");
@@ -76,17 +108,12 @@ public class TrainCharsetModel {
         int epochs = DEFAULT_EPOCHS;
         float lr = DEFAULT_LR;
         int maxSamplesPerClass = DEFAULT_MAX_SAMPLES;
-        boolean useUnigrams = true;
-        boolean useBigrams = true;
-        boolean useTrigrams = true;
-        boolean useAnchoredBigrams = false;
-        boolean useStride2Bigrams = true;
-        boolean useGlobalFeatures = false;
-        boolean useSplitSpaces = false;
         // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into
         // one target label at training time (e.g. merge script variants into one class).
         Map<String, String> labelRemap = new HashMap<>();
-        Set<String> excludeLabels = new java.util.HashSet<>();
+        // Start from the hardcoded SBCS-kitchen-sink exclusion list; CLI
+        // --exclude adds to it but cannot override.
+        Set<String> excludeLabels = new java.util.HashSet<>(TODAY_SBCS_EXCLUDE);
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -118,42 +145,6 @@ public class TrainCharsetModel {
                         labelRemap.put(kv[0].trim(), kv[1].trim());
                     }
                     break;
-                case "--no-uni":
-                    useUnigrams = false;
-                    break;
-                case "--no-bi":
-                    useBigrams = false;
-                    break;
-                case "--tri":
-                    useTrigrams = true;
-                    break;
-                case "--no-tri":
-                    useTrigrams = false;
-                    break;
-                case "--anchored":
-                    useAnchoredBigrams = true;
-                    break;
-                case "--no-anchored":
-                    useAnchoredBigrams = false;
-                    break;
-                case "--stride2":
-                    useStride2Bigrams = true;
-                    break;
-                case "--no-stride2":
-                    useStride2Bigrams = false;
-                    break;
-                case "--globals":
-                    useGlobalFeatures = true;
-                    break;
-                case "--no-globals":
-                    useGlobalFeatures = false;
-                    break;
-                case "--split-spaces":
-                    useSplitSpaces = true;
-                    break;
-                case "--no-split-spaces":
-                    useSplitSpaces = false;
-                    break;
                 case "--exclude":
                     for (String label : args[++i].split(",")) {
                         excludeLabels.add(label.trim());
@@ -173,14 +164,8 @@ public class TrainCharsetModel {
             System.err.println("  --max-samples-per-class N");
             System.err.println("  --label-remap src1:dst1,src2:dst2");
             System.err.println("                           merge source labels into a single target label");
-            System.err.println("  --no-uni                 disable unigram features");
-            System.err.println("  --no-bi                  disable bigram features");
-            System.err.println("  --tri / --no-tri         enable/disable trigram features (default: on)");
-            System.err.println("  --anchored / --no-anchored  anchored bigrams (default: off)");
-            System.err.println("  --stride2 / --no-stride2    stride-2 bigrams at even positions (default: on)");
-            System.err.println("  --globals / --no-globals    emit global ASCII-density bin features (default: off)");
-            System.err.println("  --split-spaces / --no-split-spaces  give stride-1 and stride-2 features disjoint bucket ranges (default: off)");
-            System.err.println("  --exclude cs1,cs2          skip these charset labels (e.g. UTF-32-BE,UTF-32-LE)");
+            System.err.println("  --exclude cs1,cs2          ADD these to the hardcoded exclusion list "
+                    + TODAY_SBCS_EXCLUDE);
             System.exit(1);
         }
 
@@ -226,15 +211,8 @@ public class TrainCharsetModel {
         System.out.printf(java.util.Locale.ROOT,
                 "Buckets: %d  epochs: %d  lr: %.4f  max-samples/class: %d%n",
                 numBuckets, epochs, lr, maxSamplesPerClass);
-        System.out.printf(java.util.Locale.ROOT,
-                "Features: uni=%b  bi=%b  tri=%b  anchored=%b  stride2=%b  globals=%b  split=%b%n",
-                useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams,
-                useGlobalFeatures, useSplitSpaces);
 
-        ConfigurableByteNgramFeatureExtractor extractor =
-                new ConfigurableByteNgramFeatureExtractor(numBuckets,
-                        useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams,
-                        useStride2Bigrams, useGlobalFeatures, useSplitSpaces);
+        ByteNgramFeatureExtractor extractor = new ByteNgramFeatureExtractor(numBuckets);
 
         // Build class index map
         Map<String, Integer> labelIndex = new HashMap<>();
@@ -299,12 +277,18 @@ public class TrainCharsetModel {
                 // Sparse extraction: O(probeLength), not O(numBuckets)
                 int nActive = extractor.extractSparseInto(sample, denseScratch, touched);
 
-                // Forward pass: only iterate active buckets
+                // Per-bucket contribution clip matching LinearModel.predictLogits at inference.
+                // Prevents any single colliding bucket from dominating the logit.
+                float clip = 1.5f * (float) Math.sqrt(nActive);
+
+                // Forward pass: clipped contributions, matching inference behaviour.
                 float[] logits = new float[numClasses];
                 for (int c = 0; c < numClasses; c++) {
                     float dot = biases[c];
                     for (int t = 0; t < nActive; t++) {
-                        dot += weights[c][touched[t]] * denseScratch[touched[t]];
+                        int b = touched[t];
+                        float contrib = weights[c][b] * denseScratch[b];
+                        dot += Math.max(-clip, Math.min(clip, contrib));
                     }
                     logits[c] = dot;
                 }
@@ -322,13 +306,20 @@ public class TrainCharsetModel {
                 grad[trueClass] -= 1f;
 
                 // Sparse SGD update with L2 regularization on both weights and biases.
+                // Straight-through estimator for the clip: pass the full gradient when
+                // the contribution was inside the clip window; only L2 decay when clipped.
                 for (int c = 0; c < numClasses; c++) {
                     float g = grad[c];
                     biases[c] -= lr * (g + lambda * biases[c]);
                     for (int t = 0; t < nActive; t++) {
                         int b = touched[t];
-                        weights[c][b] -= lr * (g * denseScratch[b]
-                                + lambda * weights[c][b]);
+                        float contrib = weights[c][b] * denseScratch[b];
+                        if (contrib > -clip && contrib < clip) {
+                            weights[c][b] -= lr * (g * denseScratch[b]
+                                    + lambda * weights[c][b]);
+                        } else {
+                            weights[c][b] -= lr * lambda * weights[c][b];
+                        }
                     }
                 }
                 count++;
@@ -434,7 +425,7 @@ public class TrainCharsetModel {
      */
     private static void evaluatePerCharset(
             LinearModel model,
-            FeatureExtractor<byte[]> extractor,
+            ByteNgramFeatureExtractor extractor,
             List<byte[]>[] samplesPerClass,
             String[] labels,
             int[][] groupIndices) {
diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
deleted file mode 100644
index 3958d86d81..0000000000
--- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor;
-
-public class ConfigurableGlobalFeatureTest {
-
-    private static final int NUM_BUCKETS = 16384;
-    private static final int HASH_BUCKETS = NUM_BUCKETS
-            - ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
-
-    private static ConfigurableByteNgramFeatureExtractor withGlobals() {
-        return new ConfigurableByteNgramFeatureExtractor(
-                NUM_BUCKETS, true, true, false, false, true, true);
-    }
-
-    private static ConfigurableByteNgramFeatureExtractor withoutGlobals() {
-        return new ConfigurableByteNgramFeatureExtractor(
-                NUM_BUCKETS, true, true, false, false, true, false);
-    }
-
-    @Test
-    public void pureAsciiLandsInTopBin() {
-        assertEquals(5, ConfigurableByteNgramFeatureExtractor.asciiDensityBin(
-                "BEGIN:VCARD\r\nVERSION:3.0\r\nEND:VCARD\r\n".getBytes(StandardCharsets.US_ASCII)));
-    }
-
-    @Test
-    public void sparseLatinVcardLandsInTopBin() {
-        // 99.4% ASCII: 3 high bytes in ~510 bytes of vCard text
-        byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller;Hans\r\nFN:Hans M\u00FCller\r\nADR:K\u00F6ln\r\nEND:VCARD\r\n"
-                .getBytes(StandardCharsets.ISO_8859_1);
-        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(probe);
-        assertTrue(bin >= 4, "sparse-Latin vCard should land in bin 4 or 5, got: " + bin);
-    }
-
-    @Test
-    public void ebcdicTextLandsInLowBin() {
-        // Real EBCDIC: letters 0x81..0xE9 (~80%), 0x40 space (~20%)
-        // Under the ASCII-text bin definition, 0x40 IS printable ASCII ('@'),
-        // so EBCDIC lands in bin 1, not bin 0.  What matters is that it's
-        // cleanly separated from the plain-ASCII bin 5.
-        byte[] ebcdic = new byte[100];
-        int p = 0;
-        for (int i = 0; i < 20; i++) {
-            ebcdic[p++] = 0x40;  // space
-        }
-        for (int i = 0; i < 80; i++) {
-            ebcdic[p++] = (byte) (0x81 + (i % 9));  // letters
-        }
-        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(ebcdic);
-        assertTrue(bin <= 2, "EBCDIC should land in bin 0-2, got: " + bin);
-        assertNotEquals(5, bin, "EBCDIC must not collide with the ASCII bin");
-    }
-
-    @Test
-    public void utf16LeEnglishLandsInMiddleBin() {
-        // UTF-16LE "Hello, world" — every other byte is 0x00
-        byte[] utf16 = "Hello, world! This is English text in UTF-16LE."
-                .getBytes(Charset.forName("UTF-16LE"));
-        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16);
-        assertTrue(bin == 2, "UTF-16LE English should land in bin 2 (~50%), got: " + bin);
-    }
-
-    @Test
-    public void utf16LeBmpTextLandsInMidHighBin() {
-        // UTF-16LE of BMP text (Hiragana U+3040..U+309F etc.) — note that the
-        // "high byte of the codepoint" (0x30 here) is printable ASCII '0', and
-        // the "low byte" of most Hiragana falls in 0x40..0x9F — half printable.
-        // So UTF-16LE BMP text has a HIGH printable-ASCII-byte fraction despite
-        // not being ASCII text.  The global feature does not try to distinguish
-        // UTF-16 from ASCII — that's stride-2's job.  This test documents the
-        // observed behaviour so it isn't mistaken for a bug later.
-        byte[] utf16 = "\u6587\u7AE0\u3042\u3044\u3046\u3048\u304A\u304B\u304D\u304F"
-                .getBytes(Charset.forName("UTF-16LE"));
-        int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16);
-        assertTrue(bin >= 2, "UTF-16LE BMP text has many printable bytes, got bin: " + bin);
-    }
-
-    @Test
-    public void globalFeatureFiresExactlyOneTailSlot() {
-        ConfigurableByteNgramFeatureExtractor ext = withGlobals();
-        int[] dense = new int[NUM_BUCKETS];
-        int[] touched = new int[NUM_BUCKETS];
-
-        int n = ext.extractSparseInto(
-                "Plain ASCII text with no accents at all.".getBytes(StandardCharsets.US_ASCII),
-                dense, touched);
-
-        int tailFirings = 0;
-        int tailSlot = -1;
-        for (int i = 0; i < n; i++) {
-            if (touched[i] >= HASH_BUCKETS) {
-                tailFirings++;
-                tailSlot = touched[i];
-            }
-        }
-        assertEquals(1, tailFirings, "exactly one global tail slot must fire");
-        assertEquals(HASH_BUCKETS + 5, tailSlot, "pure ASCII should fire bin 5");
-        assertEquals(1, dense[tailSlot], "count for global bin must be 1");
-    }
-
-    @Test
-    public void disablingGlobalsLeavesTailEmpty() {
-        ConfigurableByteNgramFeatureExtractor ext = withoutGlobals();
-        int[] dense = new int[NUM_BUCKETS];
-        int[] touched = new int[NUM_BUCKETS];
-
-        int n = ext.extractSparseInto(
-                "Plain ASCII text".getBytes(StandardCharsets.US_ASCII),
-                dense, touched);
-
-        for (int i = 0; i < n; i++) {
-            assertTrue(touched[i] < NUM_BUCKETS,
-                    "all firings must be in hash range when globals are off");
-        }
-    }
-
-    @Test
-    public void sparseAndDenseExtractionAgreeWithGlobals() {
-        ConfigurableByteNgramFeatureExtractor ext = withGlobals();
-        byte[] probe = "r\u00E9sum\u00E9 caf\u00E9 cr\u00E8me br\u00FBl\u00E9e"
-                .getBytes(StandardCharsets.ISO_8859_1);
-
-        int[] dense = ext.extract(probe);
-
-        int[] sparseDense = new int[NUM_BUCKETS];
-        int[] touched = new int[NUM_BUCKETS];
-        ext.extractSparseInto(probe, sparseDense, touched);
-
-        for (int i = 0; i < NUM_BUCKETS; i++) {
-            assertEquals(dense[i], sparseDense[i],
-                    "bucket " + i + " differs between dense and sparse paths");
-        }
-    }
-
-    // --- split-space layout ---
-
-    private static final int SPLIT_NUM_BUCKETS = 32768 + ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
-
-    private static ConfigurableByteNgramFeatureExtractor withSplitAndGlobals() {
-        return new ConfigurableByteNgramFeatureExtractor(
-                SPLIT_NUM_BUCKETS, true, true, false, false, true, true, true);
-    }
-
-    @Test
-    public void splitSpacesStride1FiresOnlyLowRegion() {
-        ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals();
-        int[] dense = new int[SPLIT_NUM_BUCKETS];
-        int[] touched = new int[SPLIT_NUM_BUCKETS];
-        // High bytes only — fires stride-1 unigrams + bigrams + stride-2 pairs
-        byte[] probe = new byte[]{(byte) 0xE4, (byte) 0xF6, (byte) 0xFC};
-        int n = ext.extractSparseInto(probe, dense, touched);
-
-        // stride-1 firings must be in [0, 16384), stride-2 in [16384, 32768),
-        // globals in [32768, 32774).
-        int stride1Count = 0;
-        int stride2Count = 0;
-        int globalCount = 0;
-        for (int i = 0; i < n; i++) {
-            int bkt = touched[i];
-            if (bkt < 16384) {
-                stride1Count++;
-            } else if (bkt < 32768) {
-                stride2Count++;
-            } else {
-                globalCount++;
-            }
-        }
-        assertTrue(stride1Count > 0, "expected stride-1 firings in low region");
-        assertTrue(stride2Count > 0, "expected stride-2 firings in high region");
-        assertEquals(1, globalCount, "exactly one global bin fires");
-    }
-
-    @Test
-    public void splitSpacesAsciiProbeFiresOnlyStride2AndGlobals() {
-        ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals();
-        int[] dense = new int[SPLIT_NUM_BUCKETS];
-        int[] touched = new int[SPLIT_NUM_BUCKETS];
-        // Pure ASCII — no stride-1 firings (no high bytes), all firings are
-        // stride-2 (HTML markup-shaped pairs) + the globals bin.
-        byte[] probe = "Hello, world! This is ASCII only.\r\n"
-                .getBytes(StandardCharsets.US_ASCII);
-        int n = ext.extractSparseInto(probe, dense, touched);
-
-        for (int i = 0; i < n; i++) {
-            int bkt = touched[i];
-            assertTrue(bkt >= 16384,
-                    "ASCII probe must NOT fire any stride-1 slot, got bkt=" + bkt);
-        }
-    }
-
-    @Test
-    public void splitSpacesDenseSparseAgree() {
-        ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals();
-        byte[] probe = "r\u00E9sum\u00E9 caf\u00E9"
-                .getBytes(StandardCharsets.ISO_8859_1);
-
-        int[] dense = ext.extract(probe);
-        int[] sparseDense = new int[SPLIT_NUM_BUCKETS];
-        int[] touched = new int[SPLIT_NUM_BUCKETS];
-        ext.extractSparseInto(probe, sparseDense, touched);
-
-        for (int i = 0; i < SPLIT_NUM_BUCKETS; i++) {
-            assertEquals(dense[i], sparseDense[i],
-                    "bucket " + i + " differs between dense and sparse paths (split layout)");
-        }
-    }
-}
diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
deleted file mode 100644
index 900a5dbb5c..0000000000
--- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.chardetect;
-
-import static org.junit.jupiter.api.Assertions.assertArrayEquals;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor;
-
-/**
- * Verifies that the production {@link ByteNgramFeatureExtractor} and the
- * training-time {@link ConfigurableByteNgramFeatureExtractor} produce
- * identical feature vectors when configured with matching flags.
- *
- * <p>Training flags that match the production extractor:
- * {@code --no-tri} (trigrams off, which is the default-on flag turned off),
- * default {@code --no-anchored}, default {@code --stride2}.</p>
- *
- * <p>Also verifies that {@code extract()} and {@code extractSparseInto()}
- * agree within each extractor, since training uses the sparse path and
- * eval/inference uses the dense path.</p>
- */
-public class FeatureExtractorParityTest {
-
-    private static final int NUM_BUCKETS = 16384;
-
-    private final ByteNgramFeatureExtractor production =
-            new ByteNgramFeatureExtractor(NUM_BUCKETS);
-
-    private final ConfigurableByteNgramFeatureExtractor configurable =
-            new ConfigurableByteNgramFeatureExtractor(NUM_BUCKETS,
-                    true,   // unigrams
-                    true,   // bigrams
-                    false,  // trigrams OFF  (--no-tri)
-                    false,  // anchored OFF  (default)
-                    true);  // stride2  ON   (default)
-
-    // --- Cross-extractor parity: production.extract == configurable.extract ---
-
-    @Test
-    public void parityOnPureAscii() {
-        assertParity("Hello, world! This is ASCII text.\r\n".getBytes(StandardCharsets.US_ASCII));
-    }
-
-    @Test
-    public void parityOnHighByteContent() {
-        // windows-1252 French: "résumé café"
-        assertParity(new byte[]{
-                (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75,
-                (byte) 0x6D, (byte) 0xE9, (byte) 0x20,
-                (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9
-        });
-    }
-
-    @Test
-    public void parityOnShiftJis() {
-        // Shift-JIS: lead 0x82, trail in 0x40-0x7E range
-        assertParity(new byte[]{
-                (byte) 0x82, (byte) 0x42, (byte) 0x82, (byte) 0x60,
-                (byte) 0x83, (byte) 0x41, (byte) 0x83, (byte) 0x5E
-        });
-    }
-
-    @Test
-    public void parityOnUtf16Le() {
-        // "ABCé" in UTF-16LE: 41 00 42 00 43 00 E9 00
-        assertParity(new byte[]{
-                (byte) 0x41, (byte) 0x00, (byte) 0x42, (byte) 0x00,
-                (byte) 0x43, (byte) 0x00, (byte) 0xE9, (byte) 0x00
-        });
-    }
-
-    @Test
-    public void parityOnUtf16Be() {
-        // "ABCé" in UTF-16BE: 00 41 00 42 00 43 00 E9
-        assertParity(new byte[]{
-                (byte) 0x00, (byte) 0x41, (byte) 0x00, (byte) 0x42,
-                (byte) 0x00, (byte) 0x43, (byte) 0x00, (byte) 0xE9
-        });
-    }
-
-    @Test
-    public void parityOnUtf32Le() {
-        // "AB" in UTF-32LE: 41 00 00 00 42 00 00 00
-        assertParity(new byte[]{
-                (byte) 0x41, (byte) 0x00, (byte) 0x00, (byte) 0x00,
-                (byte) 0x42, (byte) 0x00, (byte) 0x00, (byte) 0x00
-        });
-    }
-
-    @Test
-    public void parityOnUtf32Be() {
-        // "AB" in UTF-32BE: 00 00 00 41 00 00 00 42
-        assertParity(new byte[]{
-                (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x41,
-                (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x42
-        });
-    }
-
-    @Test
-    public void parityOnUtf32LeNonAscii() {
-        // U+0E01 (Thai ก) in UTF-32LE: 01 0E 00 00
-        // U+0E02 (Thai ข) in UTF-32LE: 02 0E 00 00
-        assertParity(new byte[]{
-                (byte) 0x01, (byte) 0x0E, (byte) 0x00, (byte) 0x00,
-                (byte) 0x02, (byte) 0x0E, (byte) 0x00, (byte) 0x00
-        });
-    }
-
-    @Test
-    public void parityOnUtf32BeNonAscii() {
-        // U+0E01 in UTF-32BE: 00 00 0E 01
-        // U+0E02 in UTF-32BE: 00 00 0E 02
-        assertParity(new byte[]{
-                (byte) 0x00, (byte) 0x00, (byte) 0x0E, (byte) 0x01,
-                (byte) 0x00, (byte) 0x00, (byte) 0x0E, (byte) 0x02
-        });
-    }
-
-    @Test
-    public void parityOnDenseHighBytes() {
-        // All high bytes: typical of KOI8-R or similar
-        byte[] dense = new byte[64];
-        for (int i = 0; i < dense.length; i++) {
-            dense[i] = (byte) (0xC0 + (i % 64));
-        }
-        assertParity(dense);
-    }
-
-    @Test
-    public void parityOnSingleByte() {
-        assertParity(new byte[]{(byte) 0xE0});
-    }
-
-    @Test
-    public void parityOnTwoBytes() {
-        assertParity(new byte[]{(byte) 0xE0, (byte) 0xE1});
-    }
-
-    @Test
-    public void parityOnEmpty() {
-        assertParity(new byte[0]);
-    }
-
-    @Test
-    public void parityOnRealUtf16Le() {
-        // Encode actual Unicode text as UTF-16LE to get a realistic probe
-        String text = "日本語テスト";  // Japanese
-        assertParity(text.getBytes(StandardCharsets.UTF_16LE));
-    }
-
-    @Test
-    public void parityOnRealUtf16Be() {
-        String text = "日本語テスト";
-        assertParity(text.getBytes(StandardCharsets.UTF_16BE));
-    }
-
-    @Test
-    public void parityOnRealUtf32() {
-        // UTF-32 via Charset.forName
-        Charset utf32le = Charset.forName("UTF-32LE");
-        Charset utf32be = Charset.forName("UTF-32BE");
-        String text = "Hello世界";
-        assertParity(text.getBytes(utf32le));
-        assertParity(text.getBytes(utf32be));
-    }
-
-    @Test
-    public void parityOnLongProbe() {
-        // 4096-byte probe mixing ASCII and high bytes
-        byte[] probe = new byte[4096];
-        for (int i = 0; i < probe.length; i++) {
-            probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96)));
-        }
-        assertParity(probe);
-    }
-
-    // --- Internal consistency: extract() == extractSparseInto() within each extractor ---
-
-    @Test
-    public void productionDenseMatchesSparse() {
-        String text = "日本語テスト résumé";
-        byte[] probe = text.getBytes(StandardCharsets.UTF_16LE);
-        assertDenseSparseMatch(production, probe);
-    }
-
-    @Test
-    public void configurableDenseMatchesSparse() {
-        String text = "日本語テスト résumé";
-        byte[] probe = text.getBytes(StandardCharsets.UTF_16LE);
-
-        int[] dense = configurable.extract(probe);
-        int[] sparseDense = new int[NUM_BUCKETS];
-        int[] touched = new int[NUM_BUCKETS];
-        int n = configurable.extractSparseInto(probe, sparseDense, touched);
-
-        assertArrayEquals(dense, sparseDense,
-                "ConfigurableByteNgramFeatureExtractor: extract() vs extractSparseInto() differ");
-    }
-
-    // --- Helpers ---
-
-    private void assertParity(byte[] probe) {
-        int[] prodFeatures = production.extract(probe);
-        int[] confFeatures = configurable.extract(probe);
-
-        assertEquals(prodFeatures.length, confFeatures.length,
-                "Feature vector lengths differ");
-
-        // Find first mismatch for a useful error message
-        for (int i = 0; i < prodFeatures.length; i++) {
-            if (prodFeatures[i] != confFeatures[i]) {
-                StringBuilder sb = new StringBuilder();
-                sb.append(String.format(
-                        "Bucket %d: production=%d, configurable=%d. Probe (%d bytes): [",
-                        i, prodFeatures[i], confFeatures[i], probe.length));
-                int show = Math.min(probe.length, 32);
-                for (int j = 0; j < show; j++) {
-                    if (j > 0) sb.append(' ');
-                    sb.append(String.format("%02X", probe[j] & 0xFF));
-                }
-                if (probe.length > show) sb.append(" ...");
-                sb.append(']');
-                org.junit.jupiter.api.Assertions.fail(sb.toString());
-            }
-        }
-    }
-
-    private void assertDenseSparseMatch(ByteNgramFeatureExtractor ext, byte[] probe) {
-        int[] dense = ext.extract(probe);
-        int[] sparseDense = new int[NUM_BUCKETS];
-        int[] touched = new int[NUM_BUCKETS];
-        int n = ext.extractSparseInto(probe, sparseDense, touched);
-
-        assertArrayEquals(dense, sparseDense,
-                "ByteNgramFeatureExtractor: extract() vs extractSparseInto() differ");
-    }
-
-    // =====================================================================
-    // Parity in the split-spaces + globals layout (next-generation model).
-    // =====================================================================
-
-    private static final int SPLIT_NUM_BUCKETS =
-            32768 + ByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
-
-    private final ByteNgramFeatureExtractor productionSplit =
-            new ByteNgramFeatureExtractor(SPLIT_NUM_BUCKETS, true, true);
-
-    private final ConfigurableByteNgramFeatureExtractor configurableSplit =
-            new ConfigurableByteNgramFeatureExtractor(
-                    SPLIT_NUM_BUCKETS,
-                    true,   // unigrams
-                    true,   // bigrams
-                    false,  // trigrams OFF
-                    false,  // anchored OFF
-                    true,   // stride2 ON
-                    true,   // globals ON
-                    true);  // split spaces ON
-
-    private void assertSplitParity(byte[] probe) {
-        int[] prodFeatures = productionSplit.extract(probe);
-        int[] confFeatures = configurableSplit.extract(probe);
-        assertEquals(prodFeatures.length, confFeatures.length,
-                "split-layout feature vector lengths differ");
-        for (int i = 0; i < prodFeatures.length; i++) {
-            if (prodFeatures[i] != confFeatures[i]) {
-                org.junit.jupiter.api.Assertions.fail(String.format(
-                        "split-layout bucket %d: production=%d, configurable=%d",
-                        i, prodFeatures[i], confFeatures[i]));
-            }
-        }
-    }
-
-    @Test
-    public void splitParityOnPureAscii() {
-        assertSplitParity("Hello, world! This is ASCII text.\r\n"
-                .getBytes(StandardCharsets.US_ASCII));
-    }
-
-    @Test
-    public void splitParityOnHighByteContent() {
-        assertSplitParity(new byte[]{
-                (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75,
-                (byte) 0x6D, (byte) 0xE9, (byte) 0x20,
-                (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9
-        });
-    }
-
-    @Test
-    public void splitParityOnRealUtf16Le() {
-        assertSplitParity("日本語テスト".getBytes(StandardCharsets.UTF_16LE));
-    }
-
-    @Test
-    public void splitParityOnArabicLike() {
-        // Synthesized Arabic-style byte pattern: 0xC7/0xE1/0xE3 alef/lam/meem
-        byte[] probe = new byte[]{
-                (byte) 0xC7, (byte) 0xE1, (byte) 0xE3, 0x20,
-                (byte) 0xD9, (byte) 0xED, (byte) 0xC7, (byte) 0xE1,
-                (byte) 0xCA, (byte) 0xD1, 0x0D, 0x0A
-        };
-        assertSplitParity(probe);
-    }
-
-    @Test
-    public void splitParityOnLongMixedProbe() {
-        byte[] probe = new byte[4096];
-        for (int i = 0; i < probe.length; i++) {
-            probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96)));
-        }
-        assertSplitParity(probe);
-    }
-
-    @Test
-    public void splitLayoutProductionDenseMatchesSparse() {
-        byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE);
-        int[] dense = productionSplit.extract(probe);
-        int[] sparseDense = new int[SPLIT_NUM_BUCKETS];
-        int[] touched = new int[SPLIT_NUM_BUCKETS];
-        productionSplit.extractSparseInto(probe, sparseDense, touched);
-        assertArrayEquals(dense, sparseDense,
-                "split layout: production extract() vs extractSparseInto() differ");
-    }
-
-    @Test
-    public void splitLayoutConfigurableDenseMatchesSparse() {
-        byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE);
-        int[] dense = configurableSplit.extract(probe);
-        int[] sparseDense = new int[SPLIT_NUM_BUCKETS];
-        int[] touched = new int[SPLIT_NUM_BUCKETS];
-        configurableSplit.extractSparseInto(probe, sparseDense, touched);
-        assertArrayEquals(dense, sparseDense,
-                "split layout: configurable extract() vs extractSparseInto() differ");
-    }
-}

(tika) 04/06: step 4

Reply via email to