This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch charset-ship-today in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5b2e867ef59073893a7f9422f229d04d8a92ea8d Author: tallison <[email protected]> AuthorDate: Fri Apr 17 09:35:21 2026 -0400 through step 6 --- .../tika/langdetect/charsoup/HtmlStripper.java | 82 +++++++++++----- .../tika/langdetect/charsoup/HtmlStripperTest.java | 62 ++++++++---- .../ml/chardetect/MojibusterEncodingDetector.java | 32 +++++- .../ml/chardetect/tools/TrainCharsetModel.java | 108 ++++++++++++++------- 4 files changed, 207 insertions(+), 77 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java index f36aa635c1..fd6ef5f78a 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java @@ -17,16 +17,16 @@ package org.apache.tika.langdetect.charsoup; /** - * HTML/XML markup stripping tuned for language scoring. Not a full HTML - * parser — purpose-built to feed character-bigram language detectors a - * markup-free string that still carries the page's content language. + * HTML/XML markup stripping tuned for language scoring and charset + * disambiguation. Not a full HTML parser — purpose-built to feed + * character-bigram language detectors a markup-free string that still + * carries the page's content language. * * <p>Real-world HTML probes are routinely 95-99% markup by byte count. * Without this pass, a language detector sees the markup as its primary * input — which on any HTML page looks predominantly like ASCII English - * regardless of the page's actual content language. Stripping markup - * (and decoding numeric entities, which can carry content) lets the - * detector see the actual content. 
+ * regardless of the page's actual content language. Stripping the markup + * lets the detector see the actual content. * * <h3>What it does, in one linear pass</h3> * <ul> @@ -37,15 +37,23 @@ package org.apache.tika.langdetect.charsoup; * <li>Removes {@code <!-- ... -->} comments.</li> * <li>Removes {@code <...>} tag markup (element names, attribute names, * attribute values).</li> - * <li><em>Decodes</em> numeric character references ({@code &#1234;}, - * {@code &#xABCD;}) to their actual code points — these can carry - * the page's primary content (e.g. Korean-charset pages that emit - * simplified-Chinese-only ideographs via numeric entities for - * cross-charset compatibility).</li> * <li>Replaces named entity references ({@code &amp;}, {@code &nbsp;}, * {@code &copy;}) with a space — these are nearly always * punctuation/typography with low language signal, and a full * named-entity table would be heavyweight.</li> + * <li>Default ({@link #strip(String)}): <strong>drops numeric character + * references</strong> ({@code &#1234;}, {@code &#xABCD;}) to a single + * space, on the grounds that a single numeric-entity-heavy section + * can expand to a very different byte distribution than the raw + * probe we are trying to characterise — at charset-detection time + * we want to score the raw bytes, not a synthetic Unicode rendering + * of them.</li> + * <li>Opt-in ({@link #stripAndDecodeNumeric(String)}): <em>decodes</em> + * numeric character references to their actual code points. Useful + * where numeric entities carry the page's primary content (e.g. + * pages that emit CJK ideographs via {@code &#NNNN;} for + * cross-charset compatibility, so the decoded content reaches a + * downstream language scorer).</li> * </ul> * * <h3>What it doesn't do</h3> @@ -68,14 +76,34 @@ public final class HtmlStripper { } /** - * Strip markup from {@code text} and return the content with numeric - * entities decoded. See class javadoc for details. + * Strip markup from {@code text}. 
Numeric character references are + * dropped to a space — same treatment as named entities. See class + * javadoc for details. Use {@link #stripAndDecodeNumeric(String)} when + * a caller specifically needs numeric entities decoded. * * @param text input string (HTML/XML or plain text); {@code null} or empty * returns the input unchanged - * @return content with markup removed and numeric entities decoded + * @return content with markup removed and entity references dropped to space */ public static String strip(String text) { + return strip(text, false); + } + + /** + * Strip markup from {@code text}, decoding numeric character references + * to their actual code points. Use when numeric entities carry content + * the downstream consumer needs to see (e.g. language scoring on pages + * that emit CJK ideographs as {@code &#NNNN;}). + * + * @param text input string (HTML/XML or plain text); {@code null} or empty + * returns the input unchanged + * @return content with markup removed and numeric entities decoded + */ + public static String stripAndDecodeNumeric(String text) { + return strip(text, true); + } + + private static String strip(String text, boolean decodeNumericEntities) { if (text == null || text.isEmpty()) { return text; } @@ -87,7 +115,7 @@ public final class HtmlStripper { if (c == '<') { i = handleOpenAngle(text, i, n, out); } else if (c == '&') { - i = handleAmpersand(text, i, n, out); + i = handleAmpersand(text, i, n, out, decodeNumericEntities); } else { out.append(c); i++; @@ -114,8 +142,15 @@ public final class HtmlStripper { return end < 0 ? n : end + 1; } - /** Handle a {@code &} — numeric entity (decode), named entity (drop), or literal. */ - private static int handleAmpersand(String s, int i, int n, StringBuilder out) { + /** + * Handle a {@code &} — numeric entity, named entity, or literal. 
When + * {@code decodeNumericEntities} is {@code true}, valid numeric entities + * are decoded to their Unicode code point; otherwise they are dropped + * to a space, same as named entities. An unparseable numeric entity is + * always dropped to space (it's not literal text even in no-decode mode). + */ + private static int handleAmpersand(String s, int i, int n, StringBuilder out, + boolean decodeNumericEntities) { // Look for ; within a small window — entity references are short. int max = Math.min(n, i + 12); int semi = -1; @@ -135,12 +170,15 @@ public final class HtmlStripper { } // Numeric entity? if (semi >= i + 3 && s.charAt(i + 1) == '#') { - int cp = parseNumericEntity(s, i + 2, semi); - if (cp >= 0) { - appendCodePointSafe(out, cp); - return semi + 1; + if (decodeNumericEntities) { + int cp = parseNumericEntity(s, i + 2, semi); + if (cp >= 0) { + appendCodePointSafe(out, cp); + return semi + 1; + } } - // Unparseable numeric entity — treat as space (it's not literal text). + // Default (no-decode) path, or unparseable numeric in decode mode: + // drop to a space — numeric entities are not literal text. out.append(' '); return semi + 1; } diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java index b21b2c8a40..8c5f816042 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java @@ -63,30 +63,41 @@ public class HtmlStripperTest { } @Test - public void handlesEntities() { - // Named entities (e.g. &amp;, &nbsp;) → stripped to space (low signal, - // and a full named-entity table is heavyweight). - // Numeric entities (e.g. 
&#1234;, &#x201D;) → DECODED to their actual - // code point so the content reaches the language detector. This - // matters for files where the page's primary content is delivered - // via numeric entities (e.g. industrial-product pages emitting CJK - // ideographs as &#NNNN; for cross-charset compatibility). + public void handlesEntitiesDefault() { + // Default strip(): both named and numeric entities are dropped to a + // space. Numeric decode is opt-in via stripAndDecodeNumeric(); the + // default target is charset detection on raw bytes, where a big + // numeric-entity expansion would distort what we're measuring. String stripped = HtmlStripper.strip( "<p>&amp;hello&nbsp;world&#x2013;test&#x201D;end</p>"); assertFalse(stripped.contains("&"), "No entity references should survive: " + stripped); - // 0x2013 = en-dash, 0x201D = right double quote — should appear as - // actual chars, not as entity references nor as spaces. + assertFalse(stripped.contains("\u2013"), + "Default strip must NOT decode numeric entities: " + stripped); + assertFalse(stripped.contains("\u201D"), + "Default strip must NOT decode numeric entities: " + stripped); + assertTrue(stripped.contains("hello")); + assertTrue(stripped.contains("world")); + } + + @Test + public void decodeVariantDecodesEntities() { + // stripAndDecodeNumeric() preserves the legacy behaviour: named + // entities → space, numeric entities → actual code point. Kept for + // callers that need the content behind numeric entities (e.g. + // language scoring on pages that emit CJK ideographs as &#NNNN;). 
+ String stripped = HtmlStripper.stripAndDecodeNumeric( + "<p>&amp;hello&nbsp;world&#x2013;test&#x201D;end</p>"); + assertFalse(stripped.contains("&"), + "No entity references should survive: " + stripped); assertTrue(stripped.contains("\u2013"), "Numeric entity &#x2013; should decode to en-dash: " + stripped); assertTrue(stripped.contains("\u201D"), "Numeric entity &#x201D; should decode to right double quote: " + stripped); - assertTrue(stripped.contains("hello")); - assertTrue(stripped.contains("world")); } @Test - public void decodesCjkNumericEntities() { + public void decodeVariantDecodesCjkNumericEntities() { // Real-world case: industrial-product pages that emit CJK ideographs // via numeric entities (so they render correctly regardless of the // page's declared charset). The decoded content must reach the @@ -94,7 +105,7 @@ public class HtmlStripperTest { // ASCII markup and concludes "English" no matter what the page is // actually about. String input = "<p>&#x8FC7;&#x6EE4;&#x79BB; cyclone</p>"; - String stripped = HtmlStripper.strip(input); + String stripped = HtmlStripper.stripAndDecodeNumeric(input); assertTrue(stripped.contains("\u8FC7"), "0x8FC7 (过) should decode: " + stripped); assertTrue(stripped.contains("\u6EE4"), @@ -104,11 +115,28 @@ public class HtmlStripperTest { } @Test - public void rejectsInvalidNumericEntities() { + public void defaultDropsCjkNumericEntitiesToSpaces() { + // The inverse of decodeVariantDecodesCjkNumericEntities: default + // strip() drops all numeric entities. This is what we want for + // raw-byte charset-detection scoring — the CJK ideographs are not + // part of the probe we are characterising. 
+ String input = "<p>&#x8FC7;&#x6EE4;&#x79BB; cyclone</p>"; + String stripped = HtmlStripper.strip(input); + assertFalse(stripped.contains("\u8FC7"), "default must not decode: " + stripped); + assertFalse(stripped.contains("\u6EE4"), "default must not decode: " + stripped); + assertFalse(stripped.contains("\u79BB"), "default must not decode: " + stripped); + assertTrue(stripped.contains("cyclone")); + } + + @Test + public void rejectsInvalidNumericEntitiesInDecodeVariant() { // Surrogate-half codepoints, control chars, and out-of-range numbers // should be replaced with a space rather than emitted (they would - // either crash the language detector or skew scores). - String stripped = HtmlStripper.strip("good&#xD800;bad&#0;bad&#x110000;good"); + // either crash the language detector or skew scores). Applies to + // the decode-numeric variant; the default already drops everything + // numeric to a space regardless of validity. + String stripped = HtmlStripper.stripAndDecodeNumeric( + "good&#xD800;bad&#0;bad&#x110000;good"); assertFalse(stripped.contains("\uD800"), "Surrogate code point should not be emitted: " + stripped); assertTrue(stripped.contains("good")); diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index c650284f53..23181ddb7e 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -108,15 +108,28 @@ public class MojibusterEncodingDetector implements EncodingDetector { */ CRLF_TO_WINDOWS, /** - * When the top candidate is a single-byte Latin-family charset - * (see {@link CharsetConfusables#SBCS_LATIN_FAMILY}) other than + * On <strong>short probes 
only</strong>, when the top candidate is a + * single-byte Latin-family charset (see + * {@link CharsetConfusables#SBCS_LATIN_FAMILY}) other than * windows-1252, and the probe decodes byte-identically under * windows-1252, swap the result to windows-1252 as the unmarked - * Latin default. Cheap per-probe byte walk via + * Latin default. + * + * <p>Short-probe gate: the rule only fires when + * {@code probe.length < SHORT_PROBE_THRESHOLD} (currently 50 bytes). + * On longer probes the model has seen enough high-byte evidence to + * discriminate sibling Latin code pages (windows-1250/1254/1257, + * ISO-8859-X) genuinely — rewriting to windows-1252 there would + * erase real distinctions. On short probes the model is falling + * back to bias, which is where sparse-Latin vCard-style content + * false-positives as IBM424 / windows-1257 / x-MacRoman; this gate + * catches those.</p> + * + * <p>Per-probe byte walk via * {@link DecodeEquivalence#byteIdenticalOnProbe}; short-circuits on * the first disagreeing high byte. Zero cost for probes whose top * candidate isn't Latin-family (CJK, UTF-*, EBCDIC, Cyrillic, - * Arabic, Greek, Hebrew). + * Arabic, Greek, Hebrew).</p> */ LATIN_FALLBACK_WIN1252 } @@ -536,7 +549,16 @@ public class MojibusterEncodingDetector implements EncodingDetector { results = selectAtLeast(model, logits, MIN_CANDIDATES, probe, grammar); } - if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252)) { + // LATIN_FALLBACK_WIN1252 is gated to short probes only. On long probes + // the model has enough high-byte evidence to discriminate sibling Latin + // code pages (windows-1250/1254/1257/ISO-8859-X) and we trust it; + // forcing a rewrite to windows-1252 would erase those distinctions. + // Short probes (< SHORT_PROBE_THRESHOLD bytes) are where the model + // falls back to bias — that's where the fallback prevents + // IBM424/windows-1257/x-MacRoman false positives on sparse-Latin + // vCard-style content. 
+ if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252) + && probe.length < SHORT_PROBE_THRESHOLD) { results = applyLatinFallback(probe, results); } diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java index d7379f4c8b..b46e89ea6c 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java @@ -70,36 +70,60 @@ public class TrainCharsetModel { private static final int DEFAULT_MAX_SAMPLES = 500_000; /** - * Labels excluded from the main SBCS "kitchen-sink" model by default. + * Labels the main SBCS "kitchen-sink" model is trained on today. * - * <p>Hardcoded here (rather than passed on the command line) so the model's - * class set is versioned in git alongside the code that uses it — past - * retraining runs with inconsistent CLI flags were a recurring source of - * mismatched inference/training feature sets.</p> + * <p>Include-list semantics (not exclude): {@link BuildCharsetTrainingData} + * generates training corpora for many more labels than these (EBCDIC + * nationals, DOS OEM, Mac charsets, extended ISO-8859 variants, etc.), + * pre-positioned for future specialists; today's SBCS consumes only the + * explicit set below. Hardcoded here so the model's class set is + * versioned in git alongside the code that uses it — past retraining + * runs with inconsistent CLI flags were a recurring source of mismatched + * inference/training feature sets.</p> * - * <p>{@link BuildCharsetTrainingData} still generates training corpora for - * these labels — they are needed by future specialists (e.g. 
an EBCDIC - * specialist) — but the main SBCS model doesn't consume them today:</p> + * <p>Baseline is the v6 label set ({@code chardetect-v6-no-utf32.bin}, + * 35 classes), with these changes:</p> * <ul> - * <li><b>IBM424-ltr/rtl</b> (Hebrew EBCDIC) — content bytes occupy 0x41–0x6A, - * entirely below the 0x80 threshold the shipped + * <li><b>Removed</b> {@code IBM424-ltr/rtl}, {@code IBM420-ltr/rtl} + * (Hebrew/Arabic EBCDIC) — content bytes occupy {@code 0x41–0x6A}, + * entirely below the {@code 0x80} threshold the shipped * {@link ByteNgramFeatureExtractor} considers. Training on these - * labels teaches weights the inference path cannot ever match.</li> - * <li><b>IBM420-ltr/rtl</b> (Arabic EBCDIC) — same reason as IBM424.</li> - * <li><b>IBM1047</b> (z/OS Unix System Services Latin-1) — byte-identical - * to IBM500 on most prose; having both as classes just splits the - * EBCDIC-Latin signal without adding discrimination the model can - * use.</li> + * labels teaches weights the inference path cannot match.</li> + * <li><b>Removed</b> {@code IBM1047} — byte-identical to {@code IBM500} + * on most prose; having both as classes splits the EBCDIC-Latin + * signal without adding discrimination.</li> + * <li><b>Removed</b> {@code UTF-16-LE} / {@code UTF-16-BE} — owned by + * {@code Utf16SpecialistEncodingDetector}; no longer emitted as + * main-model classes (same reasoning the v6 name + * "{@code -no-utf32}" captures for UTF-32).</li> + * <li><b>Added</b> {@code x-windows-949} — Korean MS949, strict + * superset of EUC-KR; trained as a separate class so the model + * can discriminate MS949-extension-byte content from pure + * EUC-KR.</li> * </ul> - * - * <p>CLI {@code --exclude} is unioned with this set, not replaced, so an - * operator can add further exclusions but cannot accidentally suppress - * the hardcoded policy.</p> */ - static final Set<String> TODAY_SBCS_EXCLUDE = Set.of( - "IBM424-ltr", "IBM424-rtl", - "IBM420-ltr", "IBM420-rtl", - "IBM1047"); + 
static final Set<String> TODAY_SBCS_INCLUDE = Set.of( + // CJK (multi-byte) + "Big5-HKSCS", "EUC-JP", "EUC-KR", "x-windows-949", + "GB18030", "Shift_JIS", "x-EUC-TW", + // Unicode + "UTF-8", + // EBCDIC (international Latin only — other variants deferred to specialist) + "IBM500", + // DOS / OEM Latin (retained from v6) + "IBM850", "IBM852", + // Cyrillic + "IBM855", "IBM866", "KOI8-R", "KOI8-U", + "windows-1251", "x-mac-cyrillic", + // Windows single-byte + "windows-1250", "windows-1252", "windows-1253", "windows-1254", + "windows-1255", "windows-1256", "windows-1257", "windows-1258", + "windows-874", + // ISO-8859 (only the ones v6 kept as distinct labels; 1/2/4/9 fold + // into their windows-12XX supersets) + "ISO-8859-3", "ISO-8859-16", + // Mac + "x-MacRoman"); public static void main(String[] args) throws IOException { Path dataDir = null; @@ -111,9 +135,10 @@ public class TrainCharsetModel { // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into // one target label at training time (e.g. merge script variants into one class). Map<String, String> labelRemap = new HashMap<>(); - // Start from the hardcoded SBCS-kitchen-sink exclusion list; CLI - // --exclude adds to it but cannot override. - Set<String> excludeLabels = new java.util.HashSet<>(TODAY_SBCS_EXCLUDE); + // CLI --exclude adds extra labels to drop *on top of* the include-list + // policy (used for ablation experiments). Cannot override the include + // list — labels not in the policy are excluded regardless. 
+ Set<String> excludeLabels = new java.util.HashSet<>(); for (int i = 0; i < args.length; i++) { switch (args[i]) { @@ -164,27 +189,44 @@ public class TrainCharsetModel { System.err.println(" --max-samples-per-class N"); System.err.println(" --label-remap src1:dst1,src2:dst2"); System.err.println(" merge source labels into a single target label"); - System.err.println(" --exclude cs1,cs2 ADD these to the hardcoded exclusion list " - + TODAY_SBCS_EXCLUDE); + System.err.println(" --exclude cs1,cs2 drop these additionally on top of the hardcoded " + + "include list (" + TODAY_SBCS_INCLUDE.size() + " classes in TODAY_SBCS_INCLUDE)"); System.exit(1); } - // Discover charset files + // Discover charset files. Include-list policy: only labels in + // TODAY_SBCS_INCLUDE are admitted, regardless of what files exist in + // dataDir (which may contain future-specialist corpora — Mac, DOS + // OEM, EBCDIC nationals, etc.). CLI --exclude can drop further + // labels for ablation. List<Path> charsetFiles = Files.list(dataDir) .filter(p -> p.getFileName().toString().endsWith(".bin.gz")) .filter(p -> { String cs = p.getFileName().toString().replaceAll("\\.bin\\.gz$", ""); - return !excludeLabels.contains(cs); + return TODAY_SBCS_INCLUDE.contains(cs) && !excludeLabels.contains(cs); }) .sorted() .collect(Collectors.toList()); + System.out.println("TODAY_SBCS_INCLUDE (" + TODAY_SBCS_INCLUDE.size() + " classes): " + + new java.util.TreeSet<>(TODAY_SBCS_INCLUDE)); if (!excludeLabels.isEmpty()) { - System.out.println("Excluded labels: " + excludeLabels); + System.out.println("Additional CLI --exclude: " + excludeLabels); + } + // Report any include-list classes that had no matching file on disk. 
+ java.util.Set<String> foundLabels = charsetFiles.stream() + .map(p -> p.getFileName().toString().replaceAll("\\.bin\\.gz$", "")) + .collect(Collectors.toCollection(java.util.TreeSet::new)); + java.util.Set<String> missing = new java.util.TreeSet<>(TODAY_SBCS_INCLUDE); + missing.removeAll(foundLabels); + missing.removeAll(excludeLabels); + if (!missing.isEmpty()) { + System.err.println("WARNING: include-list classes with no data file in " + + dataDir + ": " + missing); } if (charsetFiles.isEmpty()) { - System.err.println("No .bin.gz files found in: " + dataDir); + System.err.println("No matching .bin.gz files found in: " + dataDir); System.exit(1); }
