This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit 02e2f9b9d550024c6be105f4d03be269faf0c7ce Author: tallison <[email protected]> AuthorDate: Tue May 19 11:34:07 2026 -0400 TIKA-4731 - strip html entities, rebalance training, add modest clipping --- .../tika/ml/chardetect/HtmlByteStripper.java | 171 +++++++++++++++++- .../ml/chardetect/MojibusterEncodingDetector.java | 18 +- .../NaiveBayesBigramEncodingDetector.java | 170 +++++++++++++----- .../org/apache/tika/ml/chardetect/nb-bigram.bin | Bin 1020112 -> 1016638 bytes .../tika/ml/chardetect/HtmlByteStripperTest.java | 191 +++++++++++++++++++++ .../apache/tika/ml/chardetect/TraceMojibuster.java | 142 +++++++++++++-- .../ml/chardetect/tools/TrainNaiveBayesBigram.java | 41 ++++- 7 files changed, 662 insertions(+), 71 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java index 1443ae9723..09768e0977 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java @@ -66,6 +66,29 @@ public final class HtmlByteStripper { * end at the next {@code >}. Internal subsets ({@code <!DOCTYPE foo [ ... ]>}) * are rare; we'd stop at the first nested {@code >}. Acceptable. */ private static final int DECL_OR_PI = 10; + /** Just saw {@code &}. Next byte decides whether this is a named + * entity ({@code &amp;}), a numeric reference ({@code &#169;} or + * {@code &#xA9;}), or a stray ampersand. */ + private static final int ENTITY = 11; + private static final int ENTITY_NAME = 12; + /** Just saw {@code &#}. Next byte: {@code x}/{@code X} for hex, or + * decimal digit for decimal. 
*/ + private static final int ENTITY_NUM = 13; + private static final int ENTITY_DEC = 14; + private static final int ENTITY_HEX = 15; + + /** + * Maximum body length for an entity (bytes after {@code &}, including + * any {@code #}/{@code x} prefix, excluding the trailing {@code ;}). + * Standard HTML5 named entities are at most 32 bytes + * ({@code &CounterClockwiseContourIntegral;}), but only a few dozen + * exceed 16; 99% of real-world entities are well under that. A 16-byte + * cap covers the common cases ({@code &nbsp;}, {@code &laquo;}, + * {@code &Aacute;}, {@code &hellip;}, {@code &middot;}, etc.) and + * bounds pathological input that might otherwise eat language-content + * bytes before bailing out. + */ + private static final int MAX_ENTITY_BODY_LEN = 16; private static final byte[] SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'}; private static final byte[] STYLE = {'s', 't', 'y', 'l', 'e'}; @@ -105,10 +128,13 @@ public final class HtmlByteStripper { public final int length; /** Number of well-formed tags parsed (including comments). */ public final int tagCount; + /** Number of well-formed HTML entities stripped from TEXT. */ + public final int entityCount; - public Result(int length, int tagCount) { + public Result(int length, int tagCount, int entityCount) { this.length = length; this.tagCount = tagCount; + this.entityCount = entityCount; } } @@ -140,6 +166,11 @@ public final class HtmlByteStripper { int rawMatch = 0; int end = srcOffset + srcLen; int tagCount = 0; + int entityCount = 0; + // Position of the leading '&' for the in-progress entity. + // Tracked so the bailout path can emit the consumed prefix + // as literal text when the parse fails (e.g. "AT&T"). + int entityStart = 0; int attrNameStart = 0; // When true, the current quoted attribute value's bytes are // emitted to dst (attribute name matched TEXT_ATTRS). 
Reset @@ -152,6 +183,9 @@ public final class HtmlByteStripper { case TEXT: if (b == '<') { state = LT; + } else if (b == '&') { + state = ENTITY; + entityStart = i; } else { dst[w++] = b; } @@ -189,6 +223,121 @@ public final class HtmlByteStripper { } break; + case ENTITY: + // First byte after '&' decides path. + if (b == '#') { + state = ENTITY_NUM; + } else if (isAsciiLetter(b)) { + state = ENTITY_NAME; + } else { + // Not entity-shaped: emit consumed prefix (just '&') + // and re-process b under TEXT semantics. + for (int k = entityStart; k < i; k++) { + dst[w++] = src[k]; + } + if (b == '<') { + state = LT; + } else if (b == '&') { + entityStart = i; + // state stays ENTITY + } else { + dst[w++] = b; + state = TEXT; + } + } + break; + + case ENTITY_NAME: + if (b == ';') { + entityCount++; + state = TEXT; + } else if (isAsciiLetter(b) + && (i - entityStart) <= MAX_ENTITY_BODY_LEN) { + // continue accumulating; no emit + } else { + for (int k = entityStart; k < i; k++) { + dst[w++] = src[k]; + } + if (b == '<') { + state = LT; + } else if (b == '&') { + entityStart = i; + state = ENTITY; + } else { + dst[w++] = b; + state = TEXT; + } + } + break; + + case ENTITY_NUM: + // First byte after '&#': 'x'/'X' for hex, digit for decimal. 
+ if (b == 'x' || b == 'X') { + state = ENTITY_HEX; + } else if (isAsciiDigit(b)) { + state = ENTITY_DEC; + } else { + for (int k = entityStart; k < i; k++) { + dst[w++] = src[k]; + } + if (b == '<') { + state = LT; + } else if (b == '&') { + entityStart = i; + state = ENTITY; + } else { + dst[w++] = b; + state = TEXT; + } + } + break; + + case ENTITY_DEC: + if (b == ';') { + entityCount++; + state = TEXT; + } else if (isAsciiDigit(b) + && (i - entityStart) <= MAX_ENTITY_BODY_LEN) { + // continue + } else { + for (int k = entityStart; k < i; k++) { + dst[w++] = src[k]; + } + if (b == '<') { + state = LT; + } else if (b == '&') { + entityStart = i; + state = ENTITY; + } else { + dst[w++] = b; + state = TEXT; + } + } + break; + + case ENTITY_HEX: + if (b == ';') { + entityCount++; + state = TEXT; + } else if (isHexDigit(b) + && (i - entityStart) <= MAX_ENTITY_BODY_LEN) { + // continue + } else { + for (int k = entityStart; k < i; k++) { + dst[w++] = src[k]; + } + if (b == '<') { + state = LT; + } else if (b == '&') { + entityStart = i; + state = ENTITY; + } else { + dst[w++] = b; + state = TEXT; + } + } + break; + case TAG_NAME: if (isTagNameTerminator(b)) { int nameLen = i - nameStart; @@ -332,7 +481,15 @@ public final class HtmlByteStripper { } } - return new Result(w - dstOffset, tagCount); + // Unterminated entity at EOF: emit consumed prefix as literal text. 
+ if (state == ENTITY || state == ENTITY_NAME || state == ENTITY_NUM + || state == ENTITY_DEC || state == ENTITY_HEX) { + for (int k = entityStart; k < end; k++) { + dst[w++] = src[k]; + } + } + + return new Result(w - dstOffset, tagCount, entityCount); } /** @@ -350,6 +507,16 @@ public final class HtmlByteStripper { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } + private static boolean isAsciiDigit(byte b) { + int c = b & 0xFF; + return c >= '0' && c <= '9'; + } + + private static boolean isHexDigit(byte b) { + int c = b & 0xFF; + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); + } + private static boolean isTagNameTerminator(byte b) { return b == ' ' || b == '\t' || b == '\n' || b == '\r' || b == '>' || b == '/'; } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 806e1a2251..78dc9400ae 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -97,6 +97,15 @@ public class MojibusterEncodingDetector implements EncodingDetector { */ private static final int MIN_TAG_COUNT_TO_USE_STRIP = 1; + /** + * Minimum HTML entity count to apply the stripper even when no + * well-formed tags are present. A single stray {@code &amp;} + * mention in plain prose shouldn't trigger the strip path, but + * entity-heavy content (HTML-quoted text in a plain-text file, + * truncated reads where the leading tag was lost, etc.) should. 
+ */ + private static final int MIN_ENTITY_COUNT_TO_USE_STRIP = 3; + /** * Confidence attached to UTF-32 structural candidates — high but * sub-1.0 so the ResultType.STRUCTURAL flag carries meaning @@ -674,10 +683,11 @@ public class MojibusterEncodingDetector implements EncodingDetector { byte[] dst = new byte[probe.length]; HtmlByteStripper.Result stripped = HtmlByteStripper.strip(probe, 0, probe.length, dst, 0); - if (stripped.tagCount < MIN_TAG_COUNT_TO_USE_STRIP) { - // No well-formed tags found — probe isn't markup (or the - // bytes don't parse as markup in any ASCII-compatible - // reading). Use original. + if (stripped.tagCount < MIN_TAG_COUNT_TO_USE_STRIP + && stripped.entityCount < MIN_ENTITY_COUNT_TO_USE_STRIP) { + // No well-formed tags AND not enough entities to be markup — + // probe isn't markup (or the bytes don't parse as markup in + // any ASCII-compatible reading). Use original. return probe; } byte[] trimmed = new byte[stripped.length]; diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java index 98aae326b7..84d721bc12 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java @@ -101,7 +101,53 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { * of how uncertain the model actually is, so it cannot serve as * a candidate-emission gate.</p> */ - private static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20; + public static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20; + + /** + * Per-bigram cross-class total-contribution cap (Type C 
clipping). + * For each distinct bigram in the probe, the top-scoring class's + * total contribution (count × logP × idf, after dequantization) is + * capped at the runner-up class's contribution + this many nats. + * + * <p>Defends against corpus-skew pathologies where one class + * accumulates extreme bigram mass that swings classification on + * one or two byte-pairs alone (e.g., Czech "ČR" digraph in + * ISO-8859-2 contributing +186 nats over win-1252 on Italian text). + * Length-invariant by construction: the cap is on per-bigram + * advantage, regardless of how many times the bigram appears.</p> + * + * <p>20 nats = e^20 ≈ 5×10^8 probability-ratio advantage per + * bigram — preserves legitimate CJK-vs-Latin and other cross-script + * signal while bounding the diffuse-corpus-skew tail.</p> + */ + public static final double CAP_PER_BIGRAM_NATS = 20.0; + + /** + * Minimum distinct bigrams required before the per-bigram cap + * applies. On short probes, each bigram carries proportionally + * more signal — clipping would destroy more discrimination than + * it saves. + */ + public static final int MIN_DISTINCT_FOR_CAP = 30; + + /** + * Minimum distinct-bigram fraction of total-scored-bigrams. Below + * this, the input is treated as degenerate (looped / repeated / + * corrupt) and {@link #scoreClassesAndCount(byte[])} returns + * {@code null} so callers can fall back. Defends against pathological + * inputs like {@code "thththth..."} where one bigram appears + * hundreds of times. + */ + public static final double MIN_DIVERSITY_RATIO = 0.02; + + /** + * Minimum scored bigrams required before the diversity gate + * applies. Short probes legitimately have lower diversity ratios + * (fewer total bigrams = fewer opportunities for distinct ones) + * and shouldn't be gated as degenerate. Above this floor, the + * ratio measurement is meaningful. 
+ */ + public static final int MIN_BIGRAMS_FOR_DIVERSITY_GATE = 100; private final String[] labels; /** Charset objects cached at load — one {@code Charset.forName} per class, ever. */ @@ -263,45 +309,8 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { * tiny probes that can't be scored. */ public double[] scoreClasses(byte[] probe) { - if (probe == null || probe.length < 2) { - return null; - } - int len = Math.min(probe.length, MAX_PROBE_BYTES); - - // Integer hot loop — CharSoup-style. int8 logP × int8 IDF → - // int16 product, accumulated into int32 per class. Overflow - // safety: at MAX_PROBE_BYTES=16384, max 16383 bigrams × 127 × 127 - // ≈ 264M per class, well inside int32's 2.1B headroom. - int[] dots = new int[numClasses]; - for (int i = 0; i + 1 < len; i++) { - int b0 = probe[i] & 0xFF; - int b1 = probe[i + 1] & 0xFF; - // γ: bigrams where both bytes are ASCII whitespace carry no - // encoding signal, and per-class training-data preparation - // varies in how it handles consecutive whitespace (GB18030's - // training collapsed it; others retained it). That asymmetry - // can dominate scoring on HTML-stripped probes where - // whitespace bigrams are the highest-frequency tokens. Skip. - if (isWhitespace(b0) && isWhitespace(b1)) { - continue; - } - int bigram = (b0 << 8) | b1; - int w = idf8[bigram]; // non-negative, 0..127 - if (w == 0) { - continue; // bigram has no discriminative power; skip - } - int base = bigram * numClasses; - for (int c = 0; c < numClasses; c++) { - dots[c] += logP8[base + c] * w; - } - } - - // Single per-class dequantization at end of probe. - double[] score = new double[numClasses]; - for (int c = 0; c < numClasses; c++) { - score[c] = dots[c] * perClassDequant[c]; - } - return score; + ScoreResult sr = scoreClassesAndCount(probe); + return sr == null ? 
null : sr.scores; } /** @@ -371,7 +380,15 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { return null; } int len = Math.min(probe.length, MAX_PROBE_BYTES); - int[] dots = new int[numClasses]; + + // Pass 1: count distinct bigrams. Whitespace and zero-IDF + // bigrams are skipped as in the original hot loop. short[] is + // enough since count fits in 16383 (max possible). Track the + // ids of distinct bigrams in a parallel array so pass 2 doesn't + // need to scan the full 65k space. + short[] count = new short[BIGRAM_SPACE]; + int[] distinctBigrams = new int[len]; + int distinctIdx = 0; int scored = 0; int total = 0; for (int i = 0; i + 1 < len; i++) { @@ -387,14 +404,75 @@ public class NaiveBayesBigramEncodingDetector implements EncodingDetector { continue; } scored++; - int base = bigram * numClasses; - for (int c = 0; c < numClasses; c++) { - dots[c] += logP8[base + c] * w; + if (count[bigram] == 0) { + distinctBigrams[distinctIdx++] = bigram; } + count[bigram]++; } + + // Type A — diversity gate. If the input has too few distinct + // bigrams relative to total scored bigrams, it's a degenerate + // / looped input ("thththth..." or worse). Abstain — caller + // falls back. Only applied above a minimum scored-bigrams + // floor, since short probes legitimately have lower diversity + // ratios. + if (scored >= MIN_BIGRAMS_FOR_DIVERSITY_GATE + && (double) distinctIdx / scored < MIN_DIVERSITY_RATIO) { + return null; + } + + // Type C — per-bigram total-contribution cap. Only applies + // when we have enough distinct bigrams that capping any single + // one won't destroy a large fraction of the discriminative + // signal. Below the floor, short-probe semantics rule: every + // bigram counts fully. + boolean applyCap = distinctIdx >= MIN_DISTINCT_FOR_CAP; + + // Pass 2: per distinct bigram, compute per-class total + // contribution and (when above floor) apply Type C cap. 
double[] score = new double[numClasses]; - for (int c = 0; c < numClasses; c++) { - score[c] = dots[c] * perClassDequant[c]; + double[] contributions = new double[numClasses]; + for (int k = 0; k < distinctIdx; k++) { + int bigram = distinctBigrams[k]; + int n = count[bigram]; + int w = idf8[bigram]; + double countTimesIdf = (double) n * w; + int base = bigram * numClasses; + + if (!applyCap) { + // Fast path: no cap, just accumulate. + for (int c = 0; c < numClasses; c++) { + score[c] += logP8[base + c] * countTimesIdf * perClassDequant[c]; + } + continue; + } + + // logPs are negative; "best" class for the bigram = highest + // (least negative) contribution after dequant. + double max = Double.NEGATIVE_INFINITY; + double secondMax = Double.NEGATIVE_INFINITY; + for (int c = 0; c < numClasses; c++) { + double contrib = logP8[base + c] * countTimesIdf * perClassDequant[c]; + contributions[c] = contrib; + if (contrib > max) { + secondMax = max; + max = contrib; + } else if (contrib > secondMax) { + secondMax = contrib; + } + } + // Cap any class whose contribution exceeds runner-up + cap. 
+ double cap = secondMax + CAP_PER_BIGRAM_NATS; + if (max > cap) { + for (int c = 0; c < numClasses; c++) { + if (contributions[c] > cap) { + contributions[c] = cap; + } + } + } + for (int c = 0; c < numClasses; c++) { + score[c] += contributions[c]; + } } return new ScoreResult(score, scored, total); } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin index c71475ebaa..bcfce41d67 100644 Binary files a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java new file mode 100644 index 0000000000..28b027f47c --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; + +public class HtmlByteStripperTest { + + /** Helper: strip a string and return the (text, tagCount, entityCount) tuple. */ + private static StripOutcome strip(String input) { + byte[] src = input.getBytes(StandardCharsets.US_ASCII); + byte[] dst = new byte[src.length]; + HtmlByteStripper.Result r = HtmlByteStripper.strip(src, 0, src.length, dst, 0); + return new StripOutcome(new String(dst, 0, r.length, StandardCharsets.US_ASCII), + r.tagCount, r.entityCount); + } + + private static final class StripOutcome { + final String text; + final int tagCount; + final int entityCount; + StripOutcome(String text, int tagCount, int entityCount) { + this.text = text; + this.tagCount = tagCount; + this.entityCount = entityCount; + } + } + + @Test + public void namedEntityIsStripped() { + StripOutcome r = strip("hello & world"); + assertEquals("hello world", r.text); + assertEquals(1, r.entityCount); + assertEquals(0, r.tagCount); + } + + @Test + public void decimalNumericEntityIsStripped() { + StripOutcome r = strip("foo © bar"); + assertEquals("foo bar", r.text); + assertEquals(1, r.entityCount); + } + + @Test + public void hexNumericEntityIsStripped() { + StripOutcome r = strip("foo © bar"); + assertEquals("foo bar", r.text); + assertEquals(1, r.entityCount); + } + + @Test + public void hexNumericEntityUppercaseXIsStripped() { + StripOutcome r = strip("a©b"); + 
assertEquals("ab", r.text); + assertEquals(1, r.entityCount); + } + + @Test + public void ampersandFollowedByLetterWithoutSemicolonIsLiteral() { + // AT&T pattern: & followed by letter(s) but no closing ';' + StripOutcome r = strip("AT&T Inc"); + assertEquals("AT&T Inc", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void ampersandFollowedByNonLetterIsLiteral() { + // Q&A pattern: & followed by uppercase letter then space → bailout + StripOutcome r = strip("Q&A session"); + assertEquals("Q&A session", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void ampersandFollowedBySpaceIsLiteral() { + StripOutcome r = strip("a & b"); + assertEquals("a & b", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void ampersandAtEndOfInputIsLiteral() { + StripOutcome r = strip("end&"); + assertEquals("end&", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void unclosedEntityNameAtEndOfInputIsLiteral() { + StripOutcome r = strip("end&foo"); + assertEquals("end&foo", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void unclosedNumericEntityAtEndOfInputIsLiteral() { + StripOutcome r = strip("end{"); + assertEquals("end{", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void entityExceedingLengthCapIsLiteral() { + // Standard HTML5 entity longer than the 16-byte cap. + StripOutcome r = strip("x∳y"); + // The cap kicks in mid-body; the consumed prefix is emitted as + // literal text, then the rest of the bytes follow as text. + // Exact prefix depends on cap; key assertion is the entity was + // NOT counted as stripped. + assertEquals(0, r.entityCount); + // The full input is preserved as text (cap bailout emits what + // it consumed, and the remaining tail follows naturally). 
+ assertEquals("x∳y", r.text); + } + + @Test + public void adjacentEntitiesAreAllStripped() { + StripOutcome r = strip("&&&"); + assertEquals("", r.text); + assertEquals(3, r.entityCount); + } + + @Test + public void ampersandCascadingIntoTagWorks() { + // & followed by letters then '<' should emit the bailout prefix + // and then transition into tag-stripping. + StripOutcome r = strip("a&foo<b>c"); + assertEquals("a&fooc", r.text); + assertEquals(0, r.entityCount); + assertEquals(1, r.tagCount); + } + + @Test + public void ampersandCascadingIntoAnotherEntity() { + // & followed by non-entity content then another & — the + // first '&' should emit literal, second '&' starts a new entity. + StripOutcome r = strip("a&!&b"); + assertEquals("a&!b", r.text); + assertEquals(1, r.entityCount); + } + + @Test + public void tagWithEntityInBodyStripsBoth() { + StripOutcome r = strip("<p>hello world</p>"); + assertEquals("helloworld", r.text); + assertEquals(2, r.tagCount); + assertEquals(1, r.entityCount); + } + + @Test + public void plainTextNoMarkup() { + StripOutcome r = strip("just plain text, no markup at all"); + assertEquals("just plain text, no markup at all", r.text); + assertEquals(0, r.tagCount); + assertEquals(0, r.entityCount); + } + + @Test + public void emptyEntityIsLiteral() { + // "&;" — & followed immediately by ';' (not letter / not '#') + StripOutcome r = strip("a&;b"); + assertEquals("a&;b", r.text); + assertEquals(0, r.entityCount); + } + + @Test + public void numericEmptyBodyIsLiteral() { + // "&#;" — &# followed by ';' (not 'x' / not digit) + StripOutcome r = strip("a&#;b"); + assertEquals("a&#;b", r.text); + assertEquals(0, r.entityCount); + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java index db9346eb90..99017e7a3f 100644 --- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java @@ -21,6 +21,7 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.List; import java.util.Locale; @@ -39,6 +40,10 @@ public final class TraceMojibuster { public static void main(String[] args) throws Exception { Path probeDir = null; String[] probes = null; + Path inlineFile = null; + byte[] inlineBytes = null; + boolean fullRanking = false; + int rankingTopN = 25; for (int i = 0; i < args.length; i++) { switch (args[i]) { case "--probe-dir": @@ -48,13 +53,30 @@ public final class TraceMojibuster { case "--probes": probes = args[++i].split(","); break; + case "--file": + inlineFile = Paths.get(args[++i].replaceFirst("^~", + System.getProperty("user.home"))); + break; + case "--bytes-hex": + inlineBytes = decodeHex(args[++i]); + break; + case "--full-ranking": + fullRanking = true; + break; + case "--ranking-top-n": + rankingTopN = Integer.parseInt(args[++i]); + break; default: System.err.println("Unknown arg: " + args[i]); System.exit(1); } } - if (probeDir == null || probes == null) { - System.err.println("Usage: TraceMojibuster --probe-dir <dir> --probes p1,p2,..."); + boolean hasInline = inlineFile != null || inlineBytes != null; + boolean hasProbeList = probeDir != null && probes != null; + if (!hasInline && !hasProbeList) { + System.err.println("Usage: TraceMojibuster" + + " (--probe-dir <dir> --probes p1,p2,... | --file <path> | --bytes-hex <hex>)" + + " [--full-ranking] [--ranking-top-n N]"); System.exit(1); } // Load the bundled model from the classpath (same path Mojibuster uses). 
@@ -67,33 +89,62 @@ public final class TraceMojibuster { } MojibusterEncodingDetector det = new MojibusterEncodingDetector(); - for (String pid : probes) { - Path p = probeDir.resolve(pid); - if (!Files.exists(p)) { - System.err.println("Missing: " + p); - continue; + // Build the (label, bytes) work list. Inline probes (--file / + // --bytes-hex) are processed first; then any --probes from + // --probe-dir. + List<ProbeInput> work = new ArrayList<>(); + if (inlineFile != null) { + if (!Files.exists(inlineFile)) { + System.err.println("Missing: " + inlineFile); + System.exit(1); } - byte[] bytes = Files.readAllBytes(p); - String shortId = pid.contains("/") - ? pid.substring(pid.indexOf('/') + 1, pid.indexOf('/') + 13) : pid; + work.add(new ProbeInput(inlineFile.getFileName().toString(), + Files.readAllBytes(inlineFile))); + } + if (inlineBytes != null) { + work.add(new ProbeInput("inline-hex(" + inlineBytes.length + "B)", + inlineBytes)); + } + if (probes != null) { + for (String pid : probes) { + Path p = probeDir.resolve(pid); + if (!Files.exists(p)) { + System.err.println("Missing: " + p); + continue; + } + String shortId = pid.contains("/") + ? pid.substring(pid.indexOf('/') + 1, pid.indexOf('/') + 13) : pid; + work.add(new ProbeInput(shortId, Files.readAllBytes(p))); + } + } + + for (ProbeInput w : work) { + byte[] bytes = w.bytes; System.out.println(); - System.out.println("==== " + shortId + " raw=" + bytes.length + " bytes ===="); + System.out.println("==== " + w.label + " raw=" + bytes.length + " bytes ===="); // Layer 1: raw NB on raw bytes (no strip). List<EncodingResult> rawResults = rawNb.detect(bytes); System.out.println(" raw NB (no strip): " + fmt(rawResults)); + if (fullRanking) { + dumpFullRanking(rawNb, bytes, "raw", rankingTopN); + } // Layer 2: NB on HTML-stripped bytes. 
byte[] dst = new byte[bytes.length]; HtmlByteStripper.Result sr = HtmlByteStripper.strip(bytes, 0, bytes.length, dst, 0); + byte[] strippedView = null; if (sr.tagCount >= 1) { - byte[] stripped = new byte[sr.length]; - System.arraycopy(dst, 0, stripped, 0, sr.length); + strippedView = new byte[sr.length]; + System.arraycopy(dst, 0, strippedView, 0, sr.length); System.out.printf(Locale.ROOT, " HTML strip: tags=%d, post-strip=%d bytes (%.1f%% kept)%n", sr.tagCount, sr.length, 100.0 * sr.length / bytes.length); - List<EncodingResult> stripResults = rawNb.detect(stripped); + List<EncodingResult> stripResults = rawNb.detect(strippedView); System.out.println(" NB on stripped bytes: " + fmt(stripResults)); + if (fullRanking) { + dumpFullRanking(rawNb, strippedView, "strip", rankingTopN); + } } else { System.out.println(" HTML strip: tagCount=0 (backoff, used original)"); } @@ -104,6 +155,69 @@ public final class TraceMojibuster { } } + /** + * Print every class sorted by raw NB log-score for this probe. + * Shows where the true charset actually ranks before margin gating, + * gap-from-top-1 in nats, and gap-per-scored-bigram (the unit the + * margin gate uses). An "emit?" column flags which candidates would + * pass the {@link NaiveBayesBigramEncodingDetector#MARGIN_THRESHOLD_NATS_PER_BIGRAM} + * gate. 
+ */ + private static void dumpFullRanking(NaiveBayesBigramEncodingDetector nb, + byte[] probe, String layer, int topN) { + NaiveBayesBigramEncodingDetector.ScoreResult sr = nb.scoreClassesAndCount(probe); + if (sr == null) { + System.out.println(" [full-ranking " + layer + "] <probe too short to score>"); + return; + } + String[] labels = nb.getLabels(); + int n = labels.length; + Integer[] idx = new Integer[n]; + for (int i = 0; i < n; i++) idx[i] = i; + final double[] scores = sr.scores; + java.util.Arrays.sort(idx, (a, b) -> Double.compare(scores[b], scores[a])); + double top1 = scores[idx[0]]; + double marginNats = NaiveBayesBigramEncodingDetector.MARGIN_THRESHOLD_NATS_PER_BIGRAM + * Math.max(1, sr.scoredBigrams); + System.out.printf(Locale.ROOT, + " [full-ranking %s] scoredBigrams=%d totalBigrams=%d marginGate=%.3f nats (%.3f×bg)%n", + layer, sr.scoredBigrams, sr.totalBigrams, marginNats, + NaiveBayesBigramEncodingDetector.MARGIN_THRESHOLD_NATS_PER_BIGRAM); + int limit = Math.min(topN, n); + for (int rank = 0; rank < limit; rank++) { + int c = idx[rank]; + double score = scores[c]; + double gap = top1 - score; + double gapPerBg = (sr.scoredBigrams > 0) ? gap / sr.scoredBigrams : Double.NaN; + String emit = (rank == 0) ? "top1" + : (gap < marginNats ? 
"EMIT" : "----"); + System.out.printf(Locale.ROOT, + " #%2d %-18s score=%+11.3f gap=%+8.3f gap/bg=%+.4f %s%n", + rank + 1, labels[c], score, gap, gapPerBg, emit); + } + } + + private static byte[] decodeHex(String s) { + String cleaned = s.replaceAll("[\\s,:]", ""); + if (cleaned.length() % 2 != 0) { + throw new IllegalArgumentException("hex string must have even length: " + s); + } + byte[] out = new byte[cleaned.length() / 2]; + for (int i = 0; i < out.length; i++) { + out[i] = (byte) Integer.parseInt(cleaned.substring(2 * i, 2 * i + 2), 16); + } + return out; + } + + private static final class ProbeInput { + final String label; + final byte[] bytes; + ProbeInput(String label, byte[] bytes) { + this.label = label; + this.bytes = bytes; + } + } + private static String fmt(List<EncodingResult> rs) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < rs.size(); i++) { diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java index 43eecdc5da..a082f4c5be 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java @@ -37,6 +37,8 @@ import java.util.Set; import java.util.TreeMap; import java.util.zip.GZIPInputStream; +import org.apache.tika.ml.chardetect.HtmlByteStripper; + /** * Naive-Bayes byte-bigram charset classifier trainer. * @@ -80,6 +82,10 @@ public class TrainNaiveBayesBigram { private static final double DEFAULT_ALPHA_BASE = 1.0; private static final int DEFAULT_MAX_SAMPLES = 50_000; + /** Match {@code MojibusterEncodingDetector.MIN_TAG_COUNT_TO_USE_STRIP}. */ + private static final int MIN_TAG_COUNT_TO_USE_STRIP = 1; + /** Match {@code MojibusterEncodingDetector.MIN_ENTITY_COUNT_TO_USE_STRIP}. 
*/ + private static final int MIN_ENTITY_COUNT_TO_USE_STRIP = 3; /** * Default per-class vocabulary coverage. 0.999 means the top-K * most frequent bigrams covering 99.9% of the class's marginal @@ -192,7 +198,7 @@ public class TrainNaiveBayesBigram { } System.out.printf(Locale.ROOT, - "coverage=%.3f alpha-base=%.3f max-samples/class=%,d%n", + "coverage=%.3f alpha-base=%.3f max-samples/class=%,d (markup-stripped via HtmlByteStripper)%n", coverage, alphaBase, maxSamples); System.out.println("Classes (" + classFilter.size() + "): " + new java.util.TreeSet<>(classFilter)); @@ -244,8 +250,10 @@ public class TrainNaiveBayesBigram { // across contributing files so a 2-file class (UTF-16 LE+BE) // doesn't overrun a 1-file class's sample count. int perFileBudget = Math.max(1, maxSamples / entry.getValue().size()); + long samplesStripped = 0; for (Path f : entry.getValue()) { int fileSamples = 0; + byte[] stripBuf = new byte[65536]; try (InputStream fis = new FileInputStream(f.toFile()); GZIPInputStream gis = new GZIPInputStream(fis); DataInputStream dis = new DataInputStream(gis)) { @@ -258,8 +266,30 @@ public class TrainNaiveBayesBigram { } byte[] sample = new byte[len]; dis.readFully(sample); - for (int i = 0; i + 1 < sample.length; i++) { - int bigram = ((sample[i] & 0xFF) << 8) | (sample[i + 1] & 0xFF); + // Train/inference symmetry: same HtmlByteStripper + + // same gate as MojibusterEncodingDetector at + // detect-time. Hardcoded ON to prevent accidental + // misconfiguration; the bigram tables MUST reflect + // the byte distribution NB will actually score. 
+ if (stripBuf.length < len) { + stripBuf = new byte[len]; + } + HtmlByteStripper.Result r = HtmlByteStripper.strip( + sample, 0, len, stripBuf, 0); + byte[] scoreBytes; + int scoreLen; + if (r.tagCount >= MIN_TAG_COUNT_TO_USE_STRIP + || r.entityCount >= MIN_ENTITY_COUNT_TO_USE_STRIP) { + scoreBytes = stripBuf; + scoreLen = r.length; + samplesStripped++; + } else { + scoreBytes = sample; + scoreLen = len; + } + for (int i = 0; i + 1 < scoreLen; i++) { + int bigram = ((scoreBytes[i] & 0xFF) << 8) + | (scoreBytes[i + 1] & 0xFF); counts[bigram]++; totalBigrams++; } @@ -271,10 +301,11 @@ public class TrainNaiveBayesBigram { countsPerClass[ci] = counts; totalsPerClass[ci] = totalBigrams; System.out.printf(Locale.ROOT, - " counted %-20s %,7d samples %,10d total bigrams (%d file%s)%n", + " counted %-20s %,7d samples %,10d total bigrams (%d file%s) stripped=%,d%n", labels[ci], numSamples, totalBigrams, entry.getValue().size(), - entry.getValue().size() == 1 ? "" : "s"); + entry.getValue().size() == 1 ? "" : "s", + samplesStripped); ci++; }
