This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch charset-ship-today in repository https://gitbox.apache.org/repos/asf/tika.git
commit ae11a89458bb33fe77675560ae3200224b6bac25 Author: tallison <[email protected]> AuthorDate: Fri Apr 17 09:16:41 2026 -0400 step 4 --- .../org/apache/tika/detect/AutoDetectReader.java | 9 +- .../org/apache/tika/detect/CharsetSupersets.java | 89 +++++ .../apache/tika/metadata/TikaCoreProperties.java | 12 + .../ml/chardetect/ByteNgramFeatureExtractor.java | 186 +++------ .../chardetect/tools/BuildCharsetTrainingData.java | 21 +- .../ConfigurableByteNgramFeatureExtractor.java | 416 --------------------- .../ml/chardetect/tools/EvalCharsetDetectors.java | 2 +- .../ml/chardetect/tools/TraceCharsetLogits.java | 15 +- .../ml/chardetect/tools/TrainCharsetModel.java | 125 +++---- .../chardetect/ConfigurableGlobalFeatureTest.java | 233 ------------ .../ml/chardetect/FeatureExtractorParityTest.java | 354 ------------------ 11 files changed, 235 insertions(+), 1227 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index f306f69548..9e6c23297f 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -99,9 +99,14 @@ public class AutoDetectReader extends BufferedReader { // Ask all given detectors for the character encoding List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext()); if (!results.isEmpty()) { - return results.get(0).getCharset(); + Charset detected = results.get(0).getCharset(); + Charset superset = CharsetSupersets.supersetOf(detected); + if (superset != null) { + metadata.set(TikaCoreProperties.DECODED_CHARSET, superset.name()); + return superset; + } + return detected; } - Charset charset = null; // Try determining the encoding based on hints in document metadata MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); diff --git a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java new file mode 100644 index 0000000000..f53c98f847 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Maps detected charsets to safer superset charsets for decoding. + * + * <p>When Tika detects a charset that is a strict subset of a broader encoding, + * it is safer to decode with the superset — the superset handles all byte + * sequences the subset can produce, plus the extension characters the subset + * cannot represent. Decoding with only the subset risks mojibake on any + * extension characters present in the document.</p> + * + * <p>Policy: Content-Type and detected-encoding metadata report the <em>detected</em> + * charset. Actual stream decoding uses the superset. The superset used is recorded + * in {@link org.apache.tika.metadata.TikaCoreProperties#DECODED_CHARSET}.</p> + * + * <h3>Superset map</h3> + * <ul> + * <li>EUC-KR → x-windows-949 (MS949 is a strict superset: all EUC-KR byte sequences + * decode identically, extension chars in x-windows-949 would mojibake under EUC-KR)</li> + * <li>Big5 → Big5-HKSCS (HKSCS adds Hong Kong Supplementary Characters)</li> + * <li>GB2312 → GB18030 (GB18030 is a strict superset of both GB2312 and GBK)</li> + * <li>GBK → GB18030 (GB18030 is a strict superset; enables 4-byte extension sequences)</li> + * <li>Shift_JIS → windows-31j (MS932 is a strict superset with NEC/IBM extensions)</li> + * </ul> + */ +public final class CharsetSupersets { + + /** + * Maps detected charset canonical names (case-sensitive, as returned by + * {@link Charset#name()}) to their superset charset canonical name. + */ + public static final Map<String, String> SUPERSET_MAP; + + static { + Map<String, String> m = new HashMap<>(); + m.put("EUC-KR", "x-windows-949"); + m.put("Big5", "Big5-HKSCS"); + m.put("GB2312", "GB18030"); + m.put("GBK", "GB18030"); + m.put("Shift_JIS", "windows-31j"); + SUPERSET_MAP = Collections.unmodifiableMap(m); + } + + private CharsetSupersets() { + } + + /** + * Returns the superset charset to use for decoding, or {@code null} if + * {@code detected} has no superset override. + * + * @param detected the charset returned by the encoding detector + * @return superset charset, or {@code null} if none is defined + */ + public static Charset supersetOf(Charset detected) { + if (detected == null) { + return null; + } + String supersetName = SUPERSET_MAP.get(detected.name()); + if (supersetName == null) { + return null; + } + try { + return Charset.forName(supersetName); + } catch (IllegalArgumentException e) { + return null; + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index 6d513a2a67..06e0ce4f2c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -437,6 +437,18 @@ public interface TikaCoreProperties { Property ENCODING_DETECTION_TRACE = Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace"); + /** + * The charset actually used to decode the stream when a superset override was applied. + * When the detected encoding (reported in Content-Type and {@link #DETECTED_ENCODING}) is + * a subset of a safer, broader charset (e.g. EUC-KR is a subset of x-windows-949, or + * GB2312 is a subset of GB18030), Tika decodes using the superset charset to avoid + * mojibake on extension characters. This field records the superset charset name so + * callers know which codec was actually used. Absent when detection and decoding use + * the same charset. + */ + Property DECODED_CHARSET = + Property.externalText(TIKA_META_PREFIX + "decodedCharset"); + /** * General metadata key for the count of non-final versions available within a file. This * was added initially to support generalizing incremental updates in PDF. diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java index baa67fbc47..fbdac3199d 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java @@ -22,30 +22,15 @@ import org.apache.tika.ml.FeatureExtractor; * Feature extractor for raw bytes for charset detection, using FNV-1a hashing * into a fixed-width bucket array. * - * <h3>Feature set (fixed — UB-AS)</h3> - * <p>This production extractor uses the feature set selected by grid search over - * the MadLAD-derived {@code charset-detect3} corpus (34 charsets, 3 runs × 6 - * configs × 3 bucket sizes, devtest accuracy averaged to reduce SGD noise): - * <strong>unigrams + bigrams + anchored bigrams + stride-2 bigrams</strong> - * (UB-AS), 16384 buckets.</p> + * <h3>Feature set (fixed — UB-A)</h3> + * <p>This production extractor emits <strong>high-byte-anchored unigrams, + * bigrams, and anchored bigrams</strong> plus a single ASCII-density global + * feature. The total feature-vector dimension is {@link #NUM_BUCKETS}.</p> * - * <p>Key findings from the ablation/grid search:</p> - * <ul> - * <li>Trigrams (T) added no accuracy over UB-AS and were dropped.</li> - * <li>Stride-2 bigrams (S) are the single most important new feature — - * they lifted overall accuracy from ~73% (old UBT- model without UTF-16/32 - * training) to ~95% by giving the model direct code-unit visibility into - * UTF-16/32 structure.</li> - * <li>Anchored bigrams (A) add ~0.04% at 16384 buckets — tiny but consistent.</li> - * <li>Accuracy plateau between 8192 and 32768 buckets is within SGD noise; - * 16384 chosen as the best size/accuracy trade-off.</li> - * </ul> - * - * <p>The feature flags are intentionally not configurable here — the shipped model + * <p>The feature flags are intentionally not configurable — the shipped model * was trained with exactly this configuration, and using any other combination - * at inference time would produce silently wrong predictions. - * For training new models with different feature combinations, use - * {@code ConfigurableByteNgramFeatureExtractor} in the training-tools module.</p> + * at inference time would produce silently wrong predictions. Design choices + * are tracked in git rather than at the command line.</p> * * <h3>Features emitted</h3> * <ul> @@ -64,25 +49,32 @@ import org.apache.tika.ml.FeatureExtractor; * cross-character boundary structure in Shift-JIS and Big5 where trail * bytes fall below 0x80 (0x40–0x7E). A distinct salt ({@code FNV_ANCHOR_SALT}) * prevents hash collisions with stride-1 bigrams.</li> - * <li><strong>Stride-2 bigrams</strong>: pairs {@code (b[i], b[i+1])} sampled - * at even positions {@code i = 0, 2, 4, ...}, covering all bytes (not just - * high bytes). These pairs directly reflect code-unit structure: UTF-16LE - * BMP text produces many {@code (XX, 0x00)} pairs; UTF-16BE produces - * {@code (0x00, XX)}. A distinct FNV salt ({@code FNV_STRIDE2_SALT}) - * prevents hash collisions with stride-1 features. The BOM must be - * stripped upstream before bytes reach this extractor so that offset 0 - * always aligns with a real code unit, matching the BOM-free training - * data.</li> + * <li><strong>ASCII-density global</strong>: exactly one of + * {@link #GLOBAL_FEATURE_COUNT} bins fires per probe, based on the + * fraction of bytes that are printable ASCII (see + * {@link #asciiDensityBin(byte[])}). Helps the model condition its + * Western-European vs CJK vs EBCDIC decision on overall probe shape.</li> * </ul> * - * <h3>Why the high-byte filter matters for stride-1 features</h3> + * <h3>UTF-16 detection is owned by the UTF-16 specialist</h3> + * <p>Stride-2 bigrams previously emitted here were the model's primary UTF-16 + * signal. They are no longer emitted: UTF-16 detection is now handled by + * {@code Utf16SpecialistEncodingDetector}, which uses column-aggregate byte- + * range features. That specialist correctly handles Latin, Cyrillic, Arabic, + * Hebrew, Indic, Thai, CJK Unified, and Hangul UTF-16 alike — including the + * CJK UTF-16 cases that a printable-ASCII-filtered stride-2 would have + * missed (common Chinese U+4E00–U+7EFF and hiragana U+3040–U+309F are + * frequently in the {@code [0x20, 0x7E]} range). Native multi-byte CJK + * (Shift_JIS / GB18030 / Big5 / EUC-*) is still discriminated here via + * high-byte-anchored bigrams — all CJK lead bytes are {@code >= 0x81}.</p> + * + * <h3>Why the high-byte filter matters</h3> * <p>Training data is clean text (no HTML tags). Inference data is often raw * HTML (many ASCII tag bytes). Without the filter, the model would see a * different byte distribution at inference time than at training time. By * ignoring bytes below 0x80 entirely for stride-1 features, HTML tags are * invisible to both the training and inference feature computation — no - * stripping needed. Stride-2 features intentionally include all bytes because - * the low bytes are the signal (e.g. the 0x00 high byte in UTF-16 BMP text).</p> + * stripping needed.</p> */ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { @@ -90,84 +82,37 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { private static final int FNV_OFFSET = 0x811c9dc5; /** Distinct salt for anchored bigrams (high→low boundary) — prevents collision with stride-1. */ private static final int FNV_ANCHOR_SALT = 0x27d4eb2f; - /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 hashes. */ - private static final int FNV_STRIDE2_SALT = 0x9e3779b9; + + /** Total feature-vector dimension used by the shipped model (including global slots). */ + public static final int NUM_BUCKETS = 16390; /** - * Number of reserved slots at the high end of the feature vector used for - * global (whole-probe) features when {@link #useGlobalFeatures} is enabled. - * Currently 6 slots hold ASCII-text-density bins (see - * {@link #asciiDensityBin(byte[])}). Must match the training-side - * {@code ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT}. + * Number of reserved slots at the high end of the feature vector for + * global (whole-probe) features. The last 6 slots hold ASCII-text-density + * bins (see {@link #asciiDensityBin(byte[])}). Always active. */ public static final int GLOBAL_FEATURE_COUNT = 6; private final int numBuckets; - private final int stride1Buckets; - private final int stride2Buckets; - private final int stride2Base; - private final int globalBase; - private final boolean useGlobalFeatures; - private final boolean useSplitSpaces; + private final int hashSpace; // numBuckets - GLOBAL_FEATURE_COUNT + private final int globalBase; // = hashSpace (first of 6 global slots) /** - * Legacy constructor: no globals, shared stride-1/stride-2 hash space. - * Matches the layout used by the shipped {@code chardetect-v6-no-utf32.bin}. - * - * @param numBuckets number of hash buckets (feature-vector dimension) + * @param numBuckets total feature-vector dimension, including the + * {@link #GLOBAL_FEATURE_COUNT} global slots at the end. */ public ByteNgramFeatureExtractor(int numBuckets) { - this(numBuckets, false, false); - } - - /** - * Create an extractor matching the layout of a trained model. - * - * @param numBuckets total feature-vector dimension. - * @param useGlobalFeatures reserve the last {@link #GLOBAL_FEATURE_COUNT} - * slots for ASCII-density bin features. - * @param useSplitSpaces split the hash space 50/50 between stride-1 - * features (low half) and stride-2 features - * (high half) so cross-family hash collisions - * cannot pollute single-byte-charset weights - * with stride-2 signals. - */ - public ByteNgramFeatureExtractor(int numBuckets, - boolean useGlobalFeatures, - boolean useSplitSpaces) { - if (numBuckets <= 0) { - throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets); - } - int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0; - int hashSpace = numBuckets - globalsReserved; - if (hashSpace <= 0) { + if (numBuckets <= GLOBAL_FEATURE_COUNT) { throw new IllegalArgumentException( - "numBuckets must exceed GLOBAL_FEATURE_COUNT when useGlobalFeatures=true: " - + numBuckets); - } - if (useSplitSpaces && hashSpace < 2) { - throw new IllegalArgumentException( - "useSplitSpaces requires hashSpace >= 2: " + hashSpace); + "numBuckets must exceed GLOBAL_FEATURE_COUNT: " + numBuckets); } this.numBuckets = numBuckets; - this.useSplitSpaces = useSplitSpaces; - this.useGlobalFeatures = useGlobalFeatures; - if (useSplitSpaces) { - this.stride1Buckets = hashSpace / 2; - this.stride2Buckets = hashSpace - this.stride1Buckets; - this.stride2Base = this.stride1Buckets; - } else { - this.stride1Buckets = hashSpace; - this.stride2Buckets = hashSpace; - this.stride2Base = 0; - } + this.hashSpace = numBuckets - GLOBAL_FEATURE_COUNT; this.globalBase = hashSpace; } /** * Returns which ASCII-text-density bin this probe falls into, in [0, 6). - * Must match the training-side - * {@code ConfigurableByteNgramFeatureExtractor.asciiDensityBin}. * * <p>Bin layout (fraction of bytes that are ASCII-text: printable * {@code 0x20..0x7E} plus {@code 0x09 0x0A 0x0D}):</p> @@ -285,28 +230,12 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { } } - // Stride-2: code-unit pairs at positions 0, 2, 4, ... - // Covers all bytes (not just high bytes) so UTF-16 null bytes are visible. - for (int i = 0; i + 1 < input.length; i += 2) { - int b0 = input[i] & 0xFF; - int b1 = input[i + 1] & 0xFF; - int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; - h = (h ^ b1) * FNV_PRIME; - int bkt = stride2Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - - // Global features: fire exactly one ASCII-density bin. - if (useGlobalFeatures) { - int bkt = globalBase + asciiDensityBin(input); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; + // Global feature: fire exactly one ASCII-density bin. + int bkt = globalBase + asciiDensityBin(input); + if (dense[bkt] == 0) { + touched[n++] = bkt; } + dense[bkt]++; return n; } @@ -332,29 +261,14 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { } } - // Stride-2 bigrams (same logic as extractSparseInto). - for (int i = from; i + 1 < to; i += 2) { - int b0 = b[i] & 0xFF; - int b1 = b[i + 1] & 0xFF; - int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; - h = (h ^ b1) * FNV_PRIME; - counts[stride2Bucket(h)]++; - } - - // Global features: fire exactly one ASCII-density bin. - if (useGlobalFeatures) { - byte[] slice = (from == 0 && to == b.length) - ? b : java.util.Arrays.copyOfRange(b, from, to); - counts[globalBase + asciiDensityBin(slice)]++; - } + // Global feature: fire exactly one ASCII-density bin. + byte[] slice = (from == 0 && to == b.length) + ? b : java.util.Arrays.copyOfRange(b, from, to); + counts[globalBase + asciiDensityBin(slice)]++; } private int stride1Bucket(int hash) { - return (hash & 0x7fffffff) % stride1Buckets; - } - - private int stride2Bucket(int hash) { - return stride2Base + (hash & 0x7fffffff) % stride2Buckets; + return (hash & 0x7fffffff) % hashSpace; } @Override @@ -382,6 +296,6 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { @Override public String toString() { return String.format(java.util.Locale.ROOT, - "ByteNgramFeatureExtractor{buckets=%d, UB-AS}", numBuckets); + "ByteNgramFeatureExtractor{buckets=%d, UB-A}", numBuckets); } } diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java index 07e5b524e5..afd5fb4b30 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java @@ -119,6 +119,7 @@ public class BuildCharsetTrainingData { CHARSET_JAVA.put("Shift_JIS", "Shift_JIS"); CHARSET_JAVA.put("EUC-JP", "EUC-JP"); CHARSET_JAVA.put("EUC-KR", "EUC-KR"); + CHARSET_JAVA.put("x-windows-949", "x-windows-949"); CHARSET_JAVA.put("GB18030", "GB18030"); CHARSET_JAVA.put("Big5-HKSCS", "Big5-HKSCS"); CHARSET_JAVA.put("x-EUC-TW", "x-EUC-TW"); @@ -153,7 +154,15 @@ public class BuildCharsetTrainingData { CHARSET_JAVA.put("IBM852", "IBM852"); // Mac Roman CHARSET_JAVA.put("x-MacRoman", "x-MacRoman"); - // EBCDIC + // EBCDIC — all variants are generated into the training corpus so a future + // EBCDIC specialist can be trained against them. Today's main SBCS model + // consumes only a subset of these (see TrainCharsetModel's hardcoded + // exclusion list): IBM424 (Hebrew) and IBM420 (Arabic) live entirely in + // the 0x41–0x6A range, below the 0x80 threshold our feature extractor + // considers, so excluding them from today's model avoids training on a + // signal the inference path cannot see; IBM1047 is byte-identical to + // IBM500 on most prose bytes and is excluded to avoid near-duplicate + // classes in the SBCS kitchen-sink model. CHARSET_JAVA.put("IBM500", "IBM500"); CHARSET_JAVA.put("IBM1047", "IBM1047"); CHARSET_JAVA.put("IBM424-ltr", "IBM424"); @@ -237,8 +246,11 @@ public class BuildCharsetTrainingData { put("jpn", "Shift_JIS", "EUC-JP", "ISO-2022-JP"); // Chinese (Simplified) put("zho", "GB18030", "ISO-2022-CN"); - // Korean - put("kor", "EUC-KR", "ISO-2022-KR"); + // Korean — x-windows-949 (MS949) is a strict superset of EUC-KR. + // Trained as a separate class so the model can discriminate MS949- + // extension-byte content from pure-EUC-KR content. Supersets at the + // decoder level (CharsetSupersets) decode EUC-KR output as MS949 anyway. + put("kor", "EUC-KR", "ISO-2022-KR", "x-windows-949"); // Thai put("tha", "windows-874"); // Traditional Chinese — sourced from Cantonese Wikipedia (yue) @@ -306,7 +318,8 @@ public class BuildCharsetTrainingData { * ASCII-range characters. */ private static final Set<String> HIGH_BYTE_CJK = new HashSet<>(Arrays.asList( - "Shift_JIS", "EUC-JP", "EUC-KR", "GB18030", "Big5-HKSCS", "x-EUC-TW" + "Shift_JIS", "EUC-JP", "EUC-KR", "x-windows-949", + "GB18030", "Big5-HKSCS", "x-EUC-TW" )); /** RTL charsets: text is reversed (character level) before encoding. */ diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java deleted file mode 100644 index 88469abab9..0000000000 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.ml.chardetect.tools; - -import org.apache.tika.ml.FeatureExtractor; - -/** - * Configurable byte n-gram feature extractor for use during training and - * ablation studies. - * - * <p>This class exposes all hyperparameters ({@code numBuckets}, feature flags) - * as constructor arguments so that training tools and annealing scripts can - * explore the full search space. It is intentionally kept out of the - * production {@code tika-encoding-detector-mojibuster} module — the shipped - * model was trained with fixed parameters (UBT-: unigrams + bigrams + trigrams, - * no anchored bigrams, 8192 buckets) which are hard-coded in the production - * {@link org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor}.</p> - * - * <p>Using this class at inference time against a model trained with different - * flags would produce silently wrong predictions.</p> - * - * <h3>Feature flags</h3> - * <ul> - * <li><b>useUnigrams</b>: emit one feature per high byte ({@code >= 0x80})</li> - * <li><b>useBigrams</b>: emit one feature per (high, next) byte pair</li> - * <li><b>useTrigrams</b>: emit one feature per (high, next, next+1) triple</li> - * <li><b>useAnchoredBigrams</b>: emit one feature per (low-trail, next) pair - * when the trail byte is {@code < 0x80} — captures cross-character - * boundaries in encodings like Shift-JIS and Big5 with low trail bytes</li> - * <li><b>useStride2Bigrams</b>: emit one feature per (b[i], b[i+1]) pair at - * even positions i = 0, 2, 4, ... covering all bytes (not just high bytes). - * A distinct FNV salt prevents hash collision with stride-1 bigrams. - * Helps the model distinguish UTF-16BE/LE via their characteristic - * code-unit patterns.</li> - * </ul> - */ -public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { - - private static final int FNV_PRIME = 0x01000193; - private static final int FNV_OFFSET = 0x811c9dc5; - private static final int FNV_ANCHOR_SALT = 0x27d4eb2f; - /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 hashes. */ - private static final int FNV_STRIDE2_SALT = 0x9e3779b9; - - /** - * Number of reserved slots at the high end of the feature vector used for - * global (whole-probe) features when {@link #useGlobalFeatures} is enabled. - * Currently 6 slots hold ASCII-low-byte density bins (see - * {@link #asciiDensityBin(byte[])}). - */ - public static final int GLOBAL_FEATURE_COUNT = 6; - - private final int numBuckets; - private final int stride1Buckets; // size of the stride-1 hash region - private final int stride2Buckets; // size of the stride-2 hash region - private final int stride2Base; // first slot of the stride-2 region - private final int globalBase; // first slot of the globals region (or numBuckets if disabled) - private final boolean useUnigrams; - private final boolean useBigrams; - private final boolean useTrigrams; - private final boolean useAnchoredBigrams; - private final boolean useStride2Bigrams; - private final boolean useGlobalFeatures; - private final boolean useSplitSpaces; - - /** - * Backwards-compatible constructor (no global features, no split spaces). - */ - public ConfigurableByteNgramFeatureExtractor(int numBuckets, - boolean useUnigrams, - boolean useBigrams, - boolean useTrigrams, - boolean useAnchoredBigrams, - boolean useStride2Bigrams) { - this(numBuckets, useUnigrams, useBigrams, useTrigrams, - useAnchoredBigrams, useStride2Bigrams, false); - } - - /** - * Constructor with globals support, shared hash space (stride-1 and stride-2 - * mod into the same bucket range). - */ - public ConfigurableByteNgramFeatureExtractor(int numBuckets, - boolean useUnigrams, - boolean useBigrams, - boolean useTrigrams, - boolean useAnchoredBigrams, - boolean useStride2Bigrams, - boolean useGlobalFeatures) { - this(numBuckets, useUnigrams, useBigrams, useTrigrams, - useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures, false); - } - - /** - * @param numBuckets total feature-vector dimension. When - * {@code useGlobalFeatures} is {@code true}, the - * last {@link #GLOBAL_FEATURE_COUNT} slots are - * reserved for global features. When - * {@code useSplitSpaces} is {@code true}, the - * remaining hash space is split 50/50 between - * stride-1 features and stride-2 features so - * HTML-shaped stride-2 emissions cannot collide - * with single-byte-charset stride-1 weights. - * @param useUnigrams emit unigram for each high byte - * @param useBigrams emit bigram anchored on each high byte - * @param useTrigrams emit trigram anchored on each high byte - * @param useAnchoredBigrams emit bigram anchored on each low trail byte - * @param useStride2Bigrams emit stride-2 bigrams at even positions (all bytes) - * @param useGlobalFeatures emit whole-probe global features into the - * reserved tail slots (ASCII-density bins) - * @param useSplitSpaces give stride-1 and stride-2 features disjoint - * bucket ranges - */ - public ConfigurableByteNgramFeatureExtractor(int numBuckets, - boolean useUnigrams, - boolean useBigrams, - boolean useTrigrams, - boolean useAnchoredBigrams, - boolean useStride2Bigrams, - boolean useGlobalFeatures, - boolean useSplitSpaces) { - if (numBuckets <= 0) { - throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets); - } - int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0; - int hashSpace = numBuckets - globalsReserved; - if (hashSpace <= 0) { - throw new IllegalArgumentException( - "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + GLOBAL_FEATURE_COUNT - + ") when useGlobalFeatures=true: " + numBuckets); - } - if (useSplitSpaces && hashSpace < 2) { - throw new IllegalArgumentException( - "useSplitSpaces requires hashSpace >= 2: " + hashSpace); - } - this.numBuckets = numBuckets; - this.useSplitSpaces = useSplitSpaces; - if (useSplitSpaces) { - // 50/50 split; stride-1 gets the first half, stride-2 gets the second. - this.stride1Buckets = hashSpace / 2; - this.stride2Buckets = hashSpace - this.stride1Buckets; - this.stride2Base = this.stride1Buckets; - } else { - // Both stride families share the same hash region [0, hashSpace). - this.stride1Buckets = hashSpace; - this.stride2Buckets = hashSpace; - this.stride2Base = 0; - } - // Globals region always starts immediately after the hash region(s). - this.globalBase = hashSpace; - this.useUnigrams = useUnigrams; - this.useBigrams = useBigrams; - this.useTrigrams = useTrigrams; - this.useAnchoredBigrams = useAnchoredBigrams; - this.useStride2Bigrams = useStride2Bigrams; - this.useGlobalFeatures = useGlobalFeatures; - } - - /** - * Returns which ASCII-text-density bin this probe falls into, in [0, 6). - * - * <p>Counts only <em>ASCII text bytes</em> — printable (0x20..0x7E) plus - * common whitespace (0x09 tab, 0x0A LF, 0x0D CR). NUL and other control - * bytes do <em>not</em> count. This matters because UTF-16LE/BE probes - * contain ~50% 0x00 bytes; if we counted those as "low", UTF-16 English - * would look like sparse Latin to the model, defeating the point of the - * feature. With the current definition, real UTF-16 English lands around - * bin 2-3 (half ASCII-letter bytes, half nulls), distinguishable from - * plain-ASCII probes (bin 5) and from real EBCDIC (bin 0-1).</p> - * - * <p>Bin layout (fraction of bytes that are ASCII-text):</p> - * <ul> - * <li>0: [0.00, 0.10) — effectively no ASCII text (real EBCDIC letters)</li> - * <li>1: [0.10, 0.50) — heavy non-ASCII content (CJK text, UTF-16 mixed)</li> - * <li>2: [0.50, 0.80) — text with dense foreign script, UTF-16 Latin</li> - * <li>3: [0.80, 0.95) — normal foreign-script text with ASCII markup</li> - * <li>4: [0.95, 0.99) — sparse-diacritic Western text</li> - * <li>5: [0.99, 1.00] — near-pure ASCII (vCards, config, scripts)</li> - * </ul> - */ - public static int asciiDensityBin(byte[] input) { - if (input == null || input.length == 0) { - return 5; - } - int asciiText = 0; - for (byte b : input) { - int v = b & 0xFF; - if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 0x0D) { - asciiText++; - } - } - double p = (double) asciiText / input.length; - if (p < 0.10) { - return 0; - } - if (p < 0.50) { - return 1; - } - if (p < 0.80) { - return 2; - } - if (p < 0.95) { - return 3; - } - if (p < 0.99) { - return 4; - } - return 5; - } - - @Override - public int[] extract(byte[] input) { - int[] counts = new int[numBuckets]; - if (input == null || input.length == 0) { - return counts; - } - extractInto(input, 0, input.length, counts); - return counts; - } - - /** - * Sparse extraction into caller-owned, reusable buffers. O(probe length). - * - * @param input raw bytes - * @param dense scratch buffer of length {@code numBuckets}, all-zeros on entry - * @param touched receives indices of non-zero buckets - * @return number of active entries written into {@code touched} - */ - public int extractSparseInto(byte[] input, int[] dense, int[] touched) { - if (input == null || input.length == 0) { - return 0; - } - int n = 0; - - // Stride-1: high-byte-anchored features. - for (int i = 0; i < input.length; i++) { - int bi = input[i] & 0xFF; - if (bi < 0x80) { - continue; - } - - if (useUnigrams) { - int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - int bkt = stride1Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - - if (i + 1 < input.length) { - int bi1 = input[i + 1] & 0xFF; - - if (useBigrams) { - int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - h = (h ^ bi1) * FNV_PRIME; - int bkt = stride1Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - - if (useAnchoredBigrams && bi1 < 0x80) { - int h = (FNV_ANCHOR_SALT ^ bi1) * FNV_PRIME; - if (i + 2 < input.length) { - h = (h ^ (input[i + 2] & 0xFF)) * FNV_PRIME; - } - int bkt = stride1Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - - if (useTrigrams && i + 2 < input.length) { - int bi2 = input[i + 2] & 0xFF; - int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - h = (h ^ bi1) * FNV_PRIME; - h = (h ^ bi2) * FNV_PRIME; - int bkt = stride1Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - } - } - - // Stride-2: code-unit pairs at positions 0, 2, 4, ... - if (useStride2Bigrams) { - for (int i = 0; i + 1 < input.length; i += 2) { - int b0 = input[i] & 0xFF; - int b1 = input[i + 1] & 0xFF; - int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; - h = (h ^ b1) * FNV_PRIME; - int bkt = stride2Bucket(h); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - } - - // Global features at reserved tail slots: fire exactly one ASCII-density bin. - if (useGlobalFeatures) { - int bkt = globalBase + asciiDensityBin(input); - if (dense[bkt] == 0) { - touched[n++] = bkt; - } - dense[bkt]++; - } - - return n; - } - - private void extractInto(byte[] b, int from, int to, int[] counts) { - // Stride-1: high-byte-anchored features. - for (int i = from; i < to; i++) { - int bi = b[i] & 0xFF; - if (bi < 0x80) { - continue; - } - - if (useUnigrams) { - counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++; - } - - if (i + 1 < to) { - int bi1 = b[i + 1] & 0xFF; - - if (useBigrams) { - int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - h = (h ^ bi1) * FNV_PRIME; - counts[stride1Bucket(h)]++; - } - - if (useAnchoredBigrams && bi1 < 0x80) { - int h = (FNV_ANCHOR_SALT ^ bi1) * FNV_PRIME; - if (i + 2 < to) { - h = (h ^ (b[i + 2] & 0xFF)) * FNV_PRIME; - } - counts[stride1Bucket(h)]++; - } - - if (useTrigrams && i + 2 < to) { - int bi2 = b[i + 2] & 0xFF; - int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - h = (h ^ bi1) * FNV_PRIME; - h = (h ^ bi2) * FNV_PRIME; - counts[stride1Bucket(h)]++; - } - } - } - - // Stride-2 bigrams (same logic as extractSparseInto). - if (useStride2Bigrams) { - for (int i = from; i + 1 < to; i += 2) { - int b0 = b[i] & 0xFF; - int b1 = b[i + 1] & 0xFF; - int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; - h = (h ^ b1) * FNV_PRIME; - counts[stride2Bucket(h)]++; - } - } - - // Global features at reserved tail slots: fire exactly one ASCII-density bin. - if (useGlobalFeatures) { - byte[] slice = (from == 0 && to == b.length) - ? b : java.util.Arrays.copyOfRange(b, from, to); - counts[globalBase + asciiDensityBin(slice)]++; - } - } - - private int stride1Bucket(int hash) { - return (hash & 0x7fffffff) % stride1Buckets; - } - - private int stride2Bucket(int hash) { - return stride2Base + (hash & 0x7fffffff) % stride2Buckets; - } - - @Override - public int getNumBuckets() { - return numBuckets; - } - - public boolean isUseSplitSpaces() { - return useSplitSpaces; - } - - @Override - public String toString() { - return String.format(java.util.Locale.ROOT, - "ConfigurableByteNgramFeatureExtractor{buckets=%d, stride1=[0,%d) stride2=[%d,%d) globals=[%d,%d)" - + " uni=%b, bi=%b, tri=%b, anchored=%b, stride2f=%b, globalsf=%b, split=%b}", - numBuckets, stride1Buckets, stride2Base, stride2Base + stride2Buckets, - globalBase, numBuckets, - useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, - useStride2Bigrams, useGlobalFeatures, useSplitSpaces); - } -} diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java index 5ca57b1669..eca49bc1c9 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java @@ -73,7 +73,7 @@ public class EvalCharsetDetectors { private static final double OOV_THRESHOLD_CJK = 0.80; private static final double OOV_THRESHOLD_SBCS = 0.98; private static final Set<String> CJK_CHARSETS = Set.of( - "Big5", "Big5-HKSCS", "EUC-JP", "EUC-KR", "EUC-TW", + "Big5", "Big5-HKSCS", "EUC-JP", "EUC-KR", "EUC-TW", "x-windows-949", "GB18030", "GB2312", "GBK", "Shift_JIS" ); private static final Set<String> OOV_EXEMPT = Set.of( diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java index dfe13b3ade..4a749ad124 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java @@ -53,8 +53,6 @@ public final class TraceCharsetLogits { List<String> focus = new ArrayList<>(); int topBuckets = 20; int maxProbeBytes = 32 * 1024; - boolean noStride2 = false; - for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--probe": @@ -74,9 +72,6 @@ public final class TraceCharsetLogits { case "--max-probe-bytes": maxProbeBytes = Integer.parseInt(args[++i]); break; - case "--no-stride2": - noStride2 = true; - break; default: System.err.println("Unknown arg: " + args[i]); System.exit(1); @@ -89,15 +84,7 @@ public final class TraceCharsetLogits { } LinearModel model = loadModel(modelPath); - FeatureExtractor<byte[]> extractor = noStride2 - // Production flags minus stride-2, matching FeatureExtractorParityTest - // for the stride-1 features (uni + bi, no trigrams, no anchored). - ? new ConfigurableByteNgramFeatureExtractor(model.getNumBuckets(), - true, true, false, false, false) - : new ByteNgramFeatureExtractor(model.getNumBuckets()); - if (noStride2) { - System.out.println("Stride-2 features suppressed for this run."); - } + FeatureExtractor<byte[]> extractor = new ByteNgramFeatureExtractor(model.getNumBuckets()); byte[] allBytes = Files.readAllBytes(probePath); byte[] probe = allBytes.length <= maxProbeBytes diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java index 1e7a7e5cdf..d7379f4c8b 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java @@ -35,8 +35,8 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; -import org.apache.tika.ml.FeatureExtractor; import org.apache.tika.ml.LinearModel; +import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor; import org.apache.tika.ml.chardetect.CharsetConfusables; /** @@ -64,11 +64,43 @@ import org.apache.tika.ml.chardetect.CharsetConfusables; */ public class TrainCharsetModel { - private static final int DEFAULT_NUM_BUCKETS = 16384; + private static final int DEFAULT_NUM_BUCKETS = ByteNgramFeatureExtractor.NUM_BUCKETS; private static final int DEFAULT_EPOCHS = 3; private static final float DEFAULT_LR = 0.05f; private static final int DEFAULT_MAX_SAMPLES = 500_000; + /** + * Labels excluded from the main SBCS "kitchen-sink" model by default. + * + * <p>Hardcoded here (rather than passed on the command line) so the model's + * class set is versioned in git alongside the code that uses it — past + * retraining runs with inconsistent CLI flags were a recurring source of + * mismatched inference/training feature sets.</p> + * + * <p>{@link BuildCharsetTrainingData} still generates training corpora for + * these labels — they are needed by future specialists (e.g. an EBCDIC + * specialist) — but the main SBCS model doesn't consume them today:</p> + * <ul> + * <li><b>IBM424-ltr/rtl</b> (Hebrew EBCDIC) — content bytes occupy 0x41–0x6A, + * entirely below the 0x80 threshold the shipped + * {@link ByteNgramFeatureExtractor} considers. Training on these + * labels teaches weights the inference path cannot ever match.</li> + * <li><b>IBM420-ltr/rtl</b> (Arabic EBCDIC) — same reason as IBM424.</li> + * <li><b>IBM1047</b> (z/OS Unix System Services Latin-1) — byte-identical + * to IBM500 on most prose; having both as classes just splits the + * EBCDIC-Latin signal without adding discrimination the model can + * use.</li> + * </ul> + * + * <p>CLI {@code --exclude} is unioned with this set, not replaced, so an + * operator can add further exclusions but cannot accidentally suppress + * the hardcoded policy.</p> + */ + static final Set<String> TODAY_SBCS_EXCLUDE = Set.of( + "IBM424-ltr", "IBM424-rtl", + "IBM420-ltr", "IBM420-rtl", + "IBM1047"); + public static void main(String[] args) throws IOException { Path dataDir = null; Path outputPath = Paths.get("chardetect.bin"); @@ -76,17 +108,12 @@ public class TrainCharsetModel { int epochs = DEFAULT_EPOCHS; float lr = DEFAULT_LR; int maxSamplesPerClass = DEFAULT_MAX_SAMPLES; - boolean useUnigrams = true; - boolean useBigrams = true; - boolean useTrigrams = true; - boolean useAnchoredBigrams = false; - boolean useStride2Bigrams = true; - boolean useGlobalFeatures = false; - boolean useSplitSpaces = false; // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into // one target label at training time (e.g. merge script variants into one class). Map<String, String> labelRemap = new HashMap<>(); - Set<String> excludeLabels = new java.util.HashSet<>(); + // Start from the hardcoded SBCS-kitchen-sink exclusion list; CLI + // --exclude adds to it but cannot override. + Set<String> excludeLabels = new java.util.HashSet<>(TODAY_SBCS_EXCLUDE); for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -118,42 +145,6 @@ public class TrainCharsetModel { labelRemap.put(kv[0].trim(), kv[1].trim()); } break; - case "--no-uni": - useUnigrams = false; - break; - case "--no-bi": - useBigrams = false; - break; - case "--tri": - useTrigrams = true; - break; - case "--no-tri": - useTrigrams = false; - break; - case "--anchored": - useAnchoredBigrams = true; - break; - case "--no-anchored": - useAnchoredBigrams = false; - break; - case "--stride2": - useStride2Bigrams = true; - break; - case "--no-stride2": - useStride2Bigrams = false; - break; - case "--globals": - useGlobalFeatures = true; - break; - case "--no-globals": - useGlobalFeatures = false; - break; - case "--split-spaces": - useSplitSpaces = true; - break; - case "--no-split-spaces": - useSplitSpaces = false; - break; case "--exclude": for (String label : args[++i].split(",")) { excludeLabels.add(label.trim()); @@ -173,14 +164,8 @@ public class TrainCharsetModel { System.err.println(" --max-samples-per-class N"); System.err.println(" --label-remap src1:dst1,src2:dst2"); System.err.println(" merge source labels into a single target label"); - System.err.println(" --no-uni disable unigram features"); - System.err.println(" --no-bi disable bigram features"); - System.err.println(" --tri / --no-tri enable/disable trigram features (default: on)"); - System.err.println(" --anchored / --no-anchored anchored bigrams (default: off)"); - System.err.println(" --stride2 / --no-stride2 stride-2 bigrams at even positions (default: on)"); - System.err.println(" --globals / --no-globals emit global ASCII-density bin features (default: off)"); - System.err.println(" --split-spaces / --no-split-spaces give stride-1 and stride-2 features disjoint bucket ranges (default: off)"); - System.err.println(" --exclude cs1,cs2 skip these charset labels (e.g. UTF-32-BE,UTF-32-LE)"); + System.err.println(" --exclude cs1,cs2 ADD these to the hardcoded exclusion list " + + TODAY_SBCS_EXCLUDE); System.exit(1); } @@ -226,15 +211,8 @@ public class TrainCharsetModel { System.out.printf(java.util.Locale.ROOT, "Buckets: %d epochs: %d lr: %.4f max-samples/class: %d%n", numBuckets, epochs, lr, maxSamplesPerClass); - System.out.printf(java.util.Locale.ROOT, - "Features: uni=%b bi=%b tri=%b anchored=%b stride2=%b globals=%b split=%b%n", - useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams, - useGlobalFeatures, useSplitSpaces); - ConfigurableByteNgramFeatureExtractor extractor = - new ConfigurableByteNgramFeatureExtractor(numBuckets, - useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, - useStride2Bigrams, useGlobalFeatures, useSplitSpaces); + ByteNgramFeatureExtractor extractor = new ByteNgramFeatureExtractor(numBuckets); // Build class index map Map<String, Integer> labelIndex = new HashMap<>(); @@ -299,12 +277,18 @@ public class TrainCharsetModel { // Sparse extraction: O(probeLength), not O(numBuckets) int nActive = extractor.extractSparseInto(sample, denseScratch, touched); - // Forward pass: only iterate active buckets + // Per-bucket contribution clip matching LinearModel.predictLogits at inference. + // Prevents any single colliding bucket from dominating the logit. + float clip = 1.5f * (float) Math.sqrt(nActive); + + // Forward pass: clipped contributions, matching inference behaviour. float[] logits = new float[numClasses]; for (int c = 0; c < numClasses; c++) { float dot = biases[c]; for (int t = 0; t < nActive; t++) { - dot += weights[c][touched[t]] * denseScratch[touched[t]]; + int b = touched[t]; + float contrib = weights[c][b] * denseScratch[b]; + dot += Math.max(-clip, Math.min(clip, contrib)); } logits[c] = dot; } @@ -322,13 +306,20 @@ public class TrainCharsetModel { grad[trueClass] -= 1f; // Sparse SGD update with L2 regularization on both weights and biases. + // Straight-through estimator for the clip: pass the full gradient when + // the contribution was inside the clip window; only L2 decay when clipped. for (int c = 0; c < numClasses; c++) { float g = grad[c]; biases[c] -= lr * (g + lambda * biases[c]); for (int t = 0; t < nActive; t++) { int b = touched[t]; - weights[c][b] -= lr * (g * denseScratch[b] - + lambda * weights[c][b]); + float contrib = weights[c][b] * denseScratch[b]; + if (contrib > -clip && contrib < clip) { + weights[c][b] -= lr * (g * denseScratch[b] + + lambda * weights[c][b]); + } else { + weights[c][b] -= lr * lambda * weights[c][b]; + } } } count++; @@ -434,7 +425,7 @@ public class TrainCharsetModel { */ private static void evaluatePerCharset( LinearModel model, - FeatureExtractor<byte[]> extractor, + ByteNgramFeatureExtractor extractor, List<byte[]>[] samplesPerClass, String[] labels, int[][] groupIndices) { diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java deleted file mode 100644 index 3958d86d81..0000000000 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.ml.chardetect; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor; - -public class ConfigurableGlobalFeatureTest { - - private static final int NUM_BUCKETS = 16384; - private static final int HASH_BUCKETS = NUM_BUCKETS - - ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; - - private static ConfigurableByteNgramFeatureExtractor withGlobals() { - return new ConfigurableByteNgramFeatureExtractor( - NUM_BUCKETS, true, true, false, false, true, true); - } - - private static ConfigurableByteNgramFeatureExtractor withoutGlobals() { - return new ConfigurableByteNgramFeatureExtractor( - NUM_BUCKETS, true, true, false, false, true, false); - } - - @Test - public void pureAsciiLandsInTopBin() { - assertEquals(5, ConfigurableByteNgramFeatureExtractor.asciiDensityBin( - "BEGIN:VCARD\r\nVERSION:3.0\r\nEND:VCARD\r\n".getBytes(StandardCharsets.US_ASCII))); - } - - @Test - public void sparseLatinVcardLandsInTopBin() { - // 99.4% ASCII: 3 high bytes in ~510 bytes of vCard text - byte[] probe = "BEGIN:VCARD\r\nN:M\u00FCller;Hans\r\nFN:Hans M\u00FCller\r\nADR:K\u00F6ln\r\nEND:VCARD\r\n" - .getBytes(StandardCharsets.ISO_8859_1); - int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(probe); - assertTrue(bin >= 4, "sparse-Latin vCard should land in bin 4 or 5, got: " + bin); - } - - @Test - public void ebcdicTextLandsInLowBin() { - // Real EBCDIC: letters 0x81..0xE9 (~80%), 0x40 space (~20%) - // Under the ASCII-text bin definition, 0x40 IS printable ASCII ('@'), - // so EBCDIC lands in bin 1, not bin 0. What matters is that it's - // cleanly separated from the plain-ASCII bin 5. - byte[] ebcdic = new byte[100]; - int p = 0; - for (int i = 0; i < 20; i++) { - ebcdic[p++] = 0x40; // space - } - for (int i = 0; i < 80; i++) { - ebcdic[p++] = (byte) (0x81 + (i % 9)); // letters - } - int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(ebcdic); - assertTrue(bin <= 2, "EBCDIC should land in bin 0-2, got: " + bin); - assertNotEquals(5, bin, "EBCDIC must not collide with the ASCII bin"); - } - - @Test - public void utf16LeEnglishLandsInMiddleBin() { - // UTF-16LE "Hello, world" — every other byte is 0x00 - byte[] utf16 = "Hello, world! This is English text in UTF-16LE." - .getBytes(Charset.forName("UTF-16LE")); - int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16); - assertTrue(bin == 2, "UTF-16LE English should land in bin 2 (~50%), got: " + bin); - } - - @Test - public void utf16LeBmpTextLandsInMidHighBin() { - // UTF-16LE of BMP text (Hiragana U+3040..U+309F etc.) — note that the - // "high byte of the codepoint" (0x30 here) is printable ASCII '0', and - // the "low byte" of most Hiragana falls in 0x40..0x9F — half printable. - // So UTF-16LE BMP text has a HIGH printable-ASCII-byte fraction despite - // not being ASCII text. The global feature does not try to distinguish - // UTF-16 from ASCII — that's stride-2's job. This test documents the - // observed behaviour so it isn't mistaken for a bug later. - byte[] utf16 = "\u6587\u7AE0\u3042\u3044\u3046\u3048\u304A\u304B\u304D\u304F" - .getBytes(Charset.forName("UTF-16LE")); - int bin = ConfigurableByteNgramFeatureExtractor.asciiDensityBin(utf16); - assertTrue(bin >= 2, "UTF-16LE BMP text has many printable bytes, got bin: " + bin); - } - - @Test - public void globalFeatureFiresExactlyOneTailSlot() { - ConfigurableByteNgramFeatureExtractor ext = withGlobals(); - int[] dense = new int[NUM_BUCKETS]; - int[] touched = new int[NUM_BUCKETS]; - - int n = ext.extractSparseInto( - "Plain ASCII text with no accents at all.".getBytes(StandardCharsets.US_ASCII), - dense, touched); - - int tailFirings = 0; - int tailSlot = -1; - for (int i = 0; i < n; i++) { - if (touched[i] >= HASH_BUCKETS) { - tailFirings++; - tailSlot = touched[i]; - } - } - assertEquals(1, tailFirings, "exactly one global tail slot must fire"); - assertEquals(HASH_BUCKETS + 5, tailSlot, "pure ASCII should fire bin 5"); - assertEquals(1, dense[tailSlot], "count for global bin must be 1"); - } - - @Test - public void disablingGlobalsLeavesTailEmpty() { - ConfigurableByteNgramFeatureExtractor ext = withoutGlobals(); - int[] dense = new int[NUM_BUCKETS]; - int[] touched = new int[NUM_BUCKETS]; - - int n = ext.extractSparseInto( - "Plain ASCII text".getBytes(StandardCharsets.US_ASCII), - dense, touched); - - for (int i = 0; i < n; i++) { - assertTrue(touched[i] < NUM_BUCKETS, - "all firings must be in hash range when globals are off"); - } - } - - @Test - public void sparseAndDenseExtractionAgreeWithGlobals() { - ConfigurableByteNgramFeatureExtractor ext = withGlobals(); - byte[] probe = "r\u00E9sum\u00E9 caf\u00E9 cr\u00E8me br\u00FBl\u00E9e" - .getBytes(StandardCharsets.ISO_8859_1); - - int[] dense = ext.extract(probe); - - int[] sparseDense = new int[NUM_BUCKETS]; - int[] touched = new int[NUM_BUCKETS]; - ext.extractSparseInto(probe, sparseDense, touched); - - for (int i = 0; i < NUM_BUCKETS; i++) { - assertEquals(dense[i], sparseDense[i], - "bucket " + i + " differs between dense and sparse paths"); - } - } - - // --- split-space layout --- - - private static final int SPLIT_NUM_BUCKETS = 32768 + ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; - - private static ConfigurableByteNgramFeatureExtractor withSplitAndGlobals() { - return new ConfigurableByteNgramFeatureExtractor( - SPLIT_NUM_BUCKETS, true, true, false, false, true, true, true); - } - - @Test - public void splitSpacesStride1FiresOnlyLowRegion() { - ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); - int[] dense = new int[SPLIT_NUM_BUCKETS]; - int[] touched = new int[SPLIT_NUM_BUCKETS]; - // High bytes only — fires stride-1 unigrams + bigrams + stride-2 pairs - byte[] probe = new byte[]{(byte) 0xE4, (byte) 0xF6, (byte) 0xFC}; - int n = ext.extractSparseInto(probe, dense, touched); - - // stride-1 firings must be in [0, 16384), stride-2 in [16384, 32768), - // globals in [32768, 32774). - int stride1Count = 0; - int stride2Count = 0; - int globalCount = 0; - for (int i = 0; i < n; i++) { - int bkt = touched[i]; - if (bkt < 16384) { - stride1Count++; - } else if (bkt < 32768) { - stride2Count++; - } else { - globalCount++; - } - } - assertTrue(stride1Count > 0, "expected stride-1 firings in low region"); - assertTrue(stride2Count > 0, "expected stride-2 firings in high region"); - assertEquals(1, globalCount, "exactly one global bin fires"); - } - - @Test - public void splitSpacesAsciiProbeFiresOnlyStride2AndGlobals() { - ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); - int[] dense = new int[SPLIT_NUM_BUCKETS]; - int[] touched = new int[SPLIT_NUM_BUCKETS]; - // Pure ASCII — no stride-1 firings (no high bytes), all firings are - // stride-2 (HTML markup-shaped pairs) + the globals bin. - byte[] probe = "Hello, world! This is ASCII only.\r\n" - .getBytes(StandardCharsets.US_ASCII); - int n = ext.extractSparseInto(probe, dense, touched); - - for (int i = 0; i < n; i++) { - int bkt = touched[i]; - assertTrue(bkt >= 16384, - "ASCII probe must NOT fire any stride-1 slot, got bkt=" + bkt); - } - } - - @Test - public void splitSpacesDenseSparseAgree() { - ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); - byte[] probe = "r\u00E9sum\u00E9 caf\u00E9" - .getBytes(StandardCharsets.ISO_8859_1); - - int[] dense = ext.extract(probe); - int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; - int[] touched = new int[SPLIT_NUM_BUCKETS]; - ext.extractSparseInto(probe, sparseDense, touched); - - for (int i = 0; i < SPLIT_NUM_BUCKETS; i++) { - assertEquals(dense[i], sparseDense[i], - "bucket " + i + " differs between dense and sparse paths (split layout)"); - } - } -} diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java deleted file mode 100644 index 900a5dbb5c..0000000000 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.ml.chardetect; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.ml.chardetect.tools.ConfigurableByteNgramFeatureExtractor; - -/** - * Verifies that the production {@link ByteNgramFeatureExtractor} and the - * training-time {@link ConfigurableByteNgramFeatureExtractor} produce - * identical feature vectors when configured with matching flags. - * - * <p>Training flags that match the production extractor: - * {@code --no-tri} (trigrams off, which is the default-on flag turned off), - * default {@code --no-anchored}, default {@code --stride2}.</p> - * - * <p>Also verifies that {@code extract()} and {@code extractSparseInto()} - * agree within each extractor, since training uses the sparse path and - * eval/inference uses the dense path.</p> - */ -public class FeatureExtractorParityTest { - - private static final int NUM_BUCKETS = 16384; - - private final ByteNgramFeatureExtractor production = - new ByteNgramFeatureExtractor(NUM_BUCKETS); - - private final ConfigurableByteNgramFeatureExtractor configurable = - new ConfigurableByteNgramFeatureExtractor(NUM_BUCKETS, - true, // unigrams - true, // bigrams - false, // trigrams OFF (--no-tri) - false, // anchored OFF (default) - true); // stride2 ON (default) - - // --- Cross-extractor parity: production.extract == configurable.extract --- - - @Test - public void parityOnPureAscii() { - assertParity("Hello, world! This is ASCII text.\r\n".getBytes(StandardCharsets.US_ASCII)); - } - - @Test - public void parityOnHighByteContent() { - // windows-1252 French: "résumé café" - assertParity(new byte[]{ - (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75, - (byte) 0x6D, (byte) 0xE9, (byte) 0x20, - (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9 - }); - } - - @Test - public void parityOnShiftJis() { - // Shift-JIS: lead 0x82, trail in 0x40-0x7E range - assertParity(new byte[]{ - (byte) 0x82, (byte) 0x42, (byte) 0x82, (byte) 0x60, - (byte) 0x83, (byte) 0x41, (byte) 0x83, (byte) 0x5E - }); - } - - @Test - public void parityOnUtf16Le() { - // "ABCé" in UTF-16LE: 41 00 42 00 43 00 E9 00 - assertParity(new byte[]{ - (byte) 0x41, (byte) 0x00, (byte) 0x42, (byte) 0x00, - (byte) 0x43, (byte) 0x00, (byte) 0xE9, (byte) 0x00 - }); - } - - @Test - public void parityOnUtf16Be() { - // "ABCé" in UTF-16BE: 00 41 00 42 00 43 00 E9 - assertParity(new byte[]{ - (byte) 0x00, (byte) 0x41, (byte) 0x00, (byte) 0x42, - (byte) 0x00, (byte) 0x43, (byte) 0x00, (byte) 0xE9 - }); - } - - @Test - public void parityOnUtf32Le() { - // "AB" in UTF-32LE: 41 00 00 00 42 00 00 00 - assertParity(new byte[]{ - (byte) 0x41, (byte) 0x00, (byte) 0x00, (byte) 0x00, - (byte) 0x42, (byte) 0x00, (byte) 0x00, (byte) 0x00 - }); - } - - @Test - public void parityOnUtf32Be() { - // "AB" in UTF-32BE: 00 00 00 41 00 00 00 42 - assertParity(new byte[]{ - (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x41, - (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x42 - }); - } - - @Test - public void parityOnUtf32LeNonAscii() { - // U+0E01 (Thai ก) in UTF-32LE: 01 0E 00 00 - // U+0E02 (Thai ข) in UTF-32LE: 02 0E 00 00 - assertParity(new byte[]{ - (byte) 0x01, (byte) 0x0E, (byte) 0x00, (byte) 0x00, - (byte) 0x02, (byte) 0x0E, (byte) 0x00, (byte) 0x00 - }); - } - - @Test - public void parityOnUtf32BeNonAscii() { - // U+0E01 in UTF-32BE: 00 00 0E 01 - // U+0E02 in UTF-32BE: 00 00 0E 02 - assertParity(new byte[]{ - (byte) 0x00, (byte) 0x00, (byte) 0x0E, (byte) 0x01, - (byte) 0x00, (byte) 0x00, (byte) 0x0E, (byte) 0x02 - }); - } - - @Test - public void parityOnDenseHighBytes() { - // All high bytes: typical of KOI8-R or similar - byte[] dense = new byte[64]; - for (int i = 0; i < dense.length; i++) { - dense[i] = (byte) (0xC0 + (i % 64)); - } - assertParity(dense); - } - - @Test - public void parityOnSingleByte() { - assertParity(new byte[]{(byte) 0xE0}); - } - - @Test - public void parityOnTwoBytes() { - assertParity(new byte[]{(byte) 0xE0, (byte) 0xE1}); - } - - @Test - public void parityOnEmpty() { - assertParity(new byte[0]); - } - - @Test - public void parityOnRealUtf16Le() { - // Encode actual Unicode text as UTF-16LE to get a realistic probe - String text = "日本語テスト"; // Japanese - assertParity(text.getBytes(StandardCharsets.UTF_16LE)); - } - - @Test - public void parityOnRealUtf16Be() { - String text = "日本語テスト"; - assertParity(text.getBytes(StandardCharsets.UTF_16BE)); - } - - @Test - public void parityOnRealUtf32() { - // UTF-32 via Charset.forName - Charset utf32le = Charset.forName("UTF-32LE"); - Charset utf32be = Charset.forName("UTF-32BE"); - String text = "Hello世界"; - assertParity(text.getBytes(utf32le)); - assertParity(text.getBytes(utf32be)); - } - - @Test - public void parityOnLongProbe() { - // 4096-byte probe mixing ASCII and high bytes - byte[] probe = new byte[4096]; - for (int i = 0; i < probe.length; i++) { - probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96))); - } - assertParity(probe); - } - - // --- Internal consistency: extract() == extractSparseInto() within each extractor --- - - @Test - public void productionDenseMatchesSparse() { - String text = "日本語テスト résumé"; - byte[] probe = text.getBytes(StandardCharsets.UTF_16LE); - assertDenseSparseMatch(production, probe); - } - - @Test - public void configurableDenseMatchesSparse() { - String text = "日本語テスト résumé"; - byte[] probe = text.getBytes(StandardCharsets.UTF_16LE); - - int[] dense = configurable.extract(probe); - int[] sparseDense = new int[NUM_BUCKETS]; - int[] touched = new int[NUM_BUCKETS]; - int n = configurable.extractSparseInto(probe, sparseDense, touched); - - assertArrayEquals(dense, sparseDense, - "ConfigurableByteNgramFeatureExtractor: extract() vs extractSparseInto() differ"); - } - - // --- Helpers --- - - private void assertParity(byte[] probe) { - int[] prodFeatures = production.extract(probe); - int[] confFeatures = configurable.extract(probe); - - assertEquals(prodFeatures.length, confFeatures.length, - "Feature vector lengths differ"); - - // Find first mismatch for a useful error message - for (int i = 0; i < prodFeatures.length; i++) { - if (prodFeatures[i] != confFeatures[i]) { - StringBuilder sb = new StringBuilder(); - sb.append(String.format( - "Bucket %d: production=%d, configurable=%d. Probe (%d bytes): [", - i, prodFeatures[i], confFeatures[i], probe.length)); - int show = Math.min(probe.length, 32); - for (int j = 0; j < show; j++) { - if (j > 0) sb.append(' '); - sb.append(String.format("%02X", probe[j] & 0xFF)); - } - if (probe.length > show) sb.append(" ..."); - sb.append(']'); - org.junit.jupiter.api.Assertions.fail(sb.toString()); - } - } - } - - private void assertDenseSparseMatch(ByteNgramFeatureExtractor ext, byte[] probe) { - int[] dense = ext.extract(probe); - int[] sparseDense = new int[NUM_BUCKETS]; - int[] touched = new int[NUM_BUCKETS]; - int n = ext.extractSparseInto(probe, sparseDense, touched); - - assertArrayEquals(dense, sparseDense, - "ByteNgramFeatureExtractor: extract() vs extractSparseInto() differ"); - } - - // ===================================================================== - // Parity in the split-spaces + globals layout (next-generation model). - // ===================================================================== - - private static final int SPLIT_NUM_BUCKETS = - 32768 + ByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; - - private final ByteNgramFeatureExtractor productionSplit = - new ByteNgramFeatureExtractor(SPLIT_NUM_BUCKETS, true, true); - - private final ConfigurableByteNgramFeatureExtractor configurableSplit = - new ConfigurableByteNgramFeatureExtractor( - SPLIT_NUM_BUCKETS, - true, // unigrams - true, // bigrams - false, // trigrams OFF - false, // anchored OFF - true, // stride2 ON - true, // globals ON - true); // split spaces ON - - private void assertSplitParity(byte[] probe) { - int[] prodFeatures = productionSplit.extract(probe); - int[] confFeatures = configurableSplit.extract(probe); - assertEquals(prodFeatures.length, confFeatures.length, - "split-layout feature vector lengths differ"); - for (int i = 0; i < prodFeatures.length; i++) { - if (prodFeatures[i] != confFeatures[i]) { - org.junit.jupiter.api.Assertions.fail(String.format( - "split-layout bucket %d: production=%d, configurable=%d", - i, prodFeatures[i], confFeatures[i])); - } - } - } - - @Test - public void splitParityOnPureAscii() { - assertSplitParity("Hello, world! This is ASCII text.\r\n" - .getBytes(StandardCharsets.US_ASCII)); - } - - @Test - public void splitParityOnHighByteContent() { - assertSplitParity(new byte[]{ - (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75, - (byte) 0x6D, (byte) 0xE9, (byte) 0x20, - (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9 - }); - } - - @Test - public void splitParityOnRealUtf16Le() { - assertSplitParity("日本語テスト".getBytes(StandardCharsets.UTF_16LE)); - } - - @Test - public void splitParityOnArabicLike() { - // Synthesized Arabic-style byte pattern: 0xC7/0xE1/0xE3 alef/lam/meem - byte[] probe = new byte[]{ - (byte) 0xC7, (byte) 0xE1, (byte) 0xE3, 0x20, - (byte) 0xD9, (byte) 0xED, (byte) 0xC7, (byte) 0xE1, - (byte) 0xCA, (byte) 0xD1, 0x0D, 0x0A - }; - assertSplitParity(probe); - } - - @Test - public void splitParityOnLongMixedProbe() { - byte[] probe = new byte[4096]; - for (int i = 0; i < probe.length; i++) { - probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96))); - } - assertSplitParity(probe); - } - - @Test - public void splitLayoutProductionDenseMatchesSparse() { - byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE); - int[] dense = productionSplit.extract(probe); - int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; - int[] touched = new int[SPLIT_NUM_BUCKETS]; - productionSplit.extractSparseInto(probe, sparseDense, touched); - assertArrayEquals(dense, sparseDense, - "split layout: production extract() vs extractSparseInto() differ"); - } - - @Test - public void splitLayoutConfigurableDenseMatchesSparse() { - byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE); - int[] dense = configurableSplit.extract(probe); - int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; - int[] touched = new int[SPLIT_NUM_BUCKETS]; - configurableSplit.extractSparseInto(probe, sparseDense, touched); - assertArrayEquals(dense, sparseDense, - "split layout: configurable extract() vs extractSparseInto() differ"); - } -}
