This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4731-common-script in repository https://gitbox.apache.org/repos/asf/tika.git
commit 898652ae7c0d9324fcf4d9ba2f3311173f805184 Author: tallison <[email protected]> AuthorDate: Tue May 26 15:12:47 2026 -0400 TIKA-4731 - whitespace cleanup, take 32 --- .../apache/tika/ml/junkdetect/JunkDetector.java | 63 ++++++++++---------- .../tika/ml/junkdetect/TextQualityFeatures.java | 64 ++++++++++++++++++--- .../org/apache/tika/ml/junkdetect/junkdetect.bin | Bin 2317582 -> 2321862 bytes 3 files changed, 86 insertions(+), 41 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index c8c1c90cac..c231494d60 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -1123,31 +1123,6 @@ public final class JunkDetector implements TextQualityDetector { return Character.isDigit(cp); } - /** Whitespace-equivalents collapse to canonical U+0020 (' ') before bigram - * lookup. Tab, NBSP, ideographic space, line/paragraph separators, and - * ASCII control chars (\\n, \\r, etc.) all signal the SAME thing about - * surrounding script context — "end of word" — and shouldn't produce - * distinct bigram entries. Applied identically in training and inference - * via {@link #forEachScriptBigram}, so the bigram table only ever sees - * the canonical form. The follow-on (space, space) drop in - * {@link #forEachScriptBigram} then ensures collapsed whitespace runs - * (HtmlContentCleaner indentation residue: \\t\\t, \\n\\r, NBSP·\\t) - * contribute nothing to the COMMON bucket — only (script, space) and - * (space, script) boundary bigrams survive. */ - static int normalizeWhitespace(int cp) { - if (cp == 0x20) { - return cp; - } - int t = Character.getType(cp); - if (t == Character.CONTROL - || t == Character.SPACE_SEPARATOR - || t == Character.LINE_SEPARATOR - || t == Character.PARAGRAPH_SEPARATOR) { - return 0x20; - } - return cp; - } - /** * Enumerates the script-bucketed bigrams of {@code cps} under the redesign * representation (see block comment above). Used identically by inference @@ -1179,6 +1154,36 @@ public final class JunkDetector implements TextQualityDetector { return buckets; } + /** "Real" structural whitespace collapses to canonical U+0020 before bigram + * emission. Matches {@link #computeZ3ControlByte}'s definition of + * non-anomalous whitespace: HT (0x09), LF (0x0A), CR (0x0D), regular + * space (0x20), plus the Zs/Zl/Zp Unicode categories (NBSP, ideographic + * space, line/paragraph separators). + * + * <p><strong>Anomalous Cc (0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F, U+0085 + * NEL, U+0080-0x009F C1 controls) and Cf (format chars) are DELIBERATELY + * NOT normalized.</strong> Their OOV-floor signal is carrying real + * evidence that the decode is wrong — e.g., windows-1252 bytes 0x80-0x9F + * decode to printable curly quotes / em-dashes; ISO-8859-16 misdecodes + * them as C1 control codepoints; the bigram-table OOV-floor on those + * Cc-touching bigrams is what correctly penalizes the wrong decode. + * z3 has had this distinction since v15; this brings z1 in line. */ + static int normalizeWhitespace(int cp) { + if (cp == 0x20) { + return cp; + } + if (cp == 0x09 || cp == 0x0A || cp == 0x0D) { + return 0x20; + } + int t = Character.getType(cp); + if (t == Character.SPACE_SEPARATOR + || t == Character.LINE_SEPARATOR + || t == Character.PARAGRAPH_SEPARATOR) { + return 0x20; + } + return cp; + } + public static void forEachScriptBigram(int[] cps, BigramSink sink) { if (cps == null || cps.length < 2) { return; @@ -1186,13 +1191,7 @@ public final class JunkDetector implements TextQualityDetector { for (int i = 0; i + 1 < cps.length; i++) { int a = normalizeWhitespace(cps[i]); int b = normalizeWhitespace(cps[i + 1]); - // (space, space) drop short-circuits ahead of the digit check: - // after normalization, every whitespace run (\t\t, \n\r, NBSP·\t, - // HtmlContentCleaner indentation salads) collapses to this one - // bigram class — high count on HTML, zero script-boundary signal. - // (script, space) / (space, script) boundary bigrams survive. - if ((a == 0x20 && b == 0x20) - || isSkipCodepoint(a) || isSkipCodepoint(b)) { + if (isSkipCodepoint(a) || isSkipCodepoint(b)) { continue; } String ka = classKey(a); diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java index adb5b89789..25f07b765f 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java @@ -201,27 +201,73 @@ public final class TextQualityFeatures { } /** - * z8: fraction of codepoints that are the Unicode REPLACEMENT CHARACTER - * (U+FFFD). Direct decode-failure signal — Java's CharsetDecoder emits - * one U+FFFD per malformed/unmappable byte. Continuous (not a binary - * threshold) so the JunkDetector LR can learn a proportional weight on - * it rather than vetoing decodes that happen to contain any. + * z6 raw input: fraction of codepoints that are "anomaly indicators" — + * codepoints that shouldn't appear in correctly-decoded natural text. + * Direct decode-failure signal — wrong encodings produce these in bulk + * (Java's CharsetDecoder emits U+FFFD per malformed byte; ISO-8859-X + * misreads windows-1252 high bytes as C1 controls; PDF cmap failures + * emit private-use codepoints; etc.). + * + * <p>Anomaly set: + * <ul> + * <li>U+FFFD (REPLACEMENT CHARACTER) — the direct decode-failure marker + * <li>Anomalous Cc: {@code 0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F} + * (matching z3's byte-level anomaly definition, at codepoint level) + * <li>C1 control codepoints: {@code U+0080-U+009F} — the + * ISO-8859-X-misdecodes-windows-1252 signal + * <li>Private use area: {@code U+E000-U+F8FF}, plus planes 15-16 PUA — + * the PDF cmap-failure signal + * </ul> + * + * <p>Continuous (not a binary threshold) so the JunkDetector combiner LR + * can learn a proportional weight on it. Excluded: 0x00 (NUL — can + * occur legitimately in some text streams; matches z3's exclusion); + * 0x09/0x0A/0x0D/0x20 (legitimate whitespace); Cf format chars (ZWJ etc. + * have legitimate linguistic uses); Cn unassigned (rare in practice). */ public static double replacementRatio(String text) { if (text == null || text.isEmpty()) { return Double.NaN; } int total = 0; - int replacements = 0; + int anomaly = 0; for (int i = 0; i < text.length(); ) { int cp = text.codePointAt(i); i += Character.charCount(cp); total++; - if (cp == 0xFFFD) { - replacements++; + if (isAnomalyCodepoint(cp)) { + anomaly++; } } - return total == 0 ? Double.NaN : (double) replacements / total; + return total == 0 ? Double.NaN : (double) anomaly / total; + } + + /** True if {@code cp} is in the z6 anomaly set: U+FFFD, anomalous Cc + * (matching z3 byte-level definition), C1 controls, or private use. */ + static boolean isAnomalyCodepoint(int cp) { + if (cp == 0xFFFD) { + return true; + } + // Anomalous Cc (excludes 0x00, 0x09, 0x0A, 0x0D — match z3) + if ((cp >= 0x01 && cp <= 0x08) + || cp == 0x0B || cp == 0x0C + || (cp >= 0x0E && cp <= 0x1F) + || cp == 0x7F) { + return true; + } + // C1 controls — the ISO-8859-X-misreads-windows-1252 signal + if (cp >= 0x0080 && cp <= 0x009F) { + return true; + } + // Private use area (BMP) + if (cp >= 0xE000 && cp <= 0xF8FF) { + return true; + } + // Supplementary PUA (planes 15 and 16) + if (cp >= 0xF0000 && cp <= 0x10FFFD) { + return true; + } + return false; } /** diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin index c5a88fa093..a83dd647e6 100644 Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ
