(tika) 09/09: TIKA-4731 - whitespace cleanup, take 32

tallison Tue, 26 May 2026 12:19:09 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 898652ae7c0d9324fcf4d9ba2f3311173f805184
Author: tallison <[email protected]>
AuthorDate: Tue May 26 15:12:47 2026 -0400

    TIKA-4731 - whitespace cleanup, take 32
---
 .../apache/tika/ml/junkdetect/JunkDetector.java    |  63 ++++++++++----------
 .../tika/ml/junkdetect/TextQualityFeatures.java    |  64 ++++++++++++++++++---
 .../org/apache/tika/ml/junkdetect/junkdetect.bin   | Bin 2317582 -> 2321862 bytes
 3 files changed, 86 insertions(+), 41 deletions(-)

diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
index c8c1c90cac..c231494d60 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java
@@ -1123,31 +1123,6 @@ public final class JunkDetector implements TextQualityDetector {
         return Character.isDigit(cp);
     }
 
-    /** Whitespace-equivalents collapse to canonical U+0020 (' ') before bigram
-     *  lookup.  Tab, NBSP, ideographic space, line/paragraph separators, and
-     *  ASCII control chars (\\n, \\r, etc.) all signal the SAME thing about
-     *  surrounding script context — "end of word" — and shouldn't produce
-     *  distinct bigram entries.  Applied identically in training and inference
-     *  via {@link #forEachScriptBigram}, so the bigram table only ever sees
-     *  the canonical form.  The follow-on (space, space) drop in
-     *  {@link #forEachScriptBigram} then ensures collapsed whitespace runs
-     *  (HtmlContentCleaner indentation residue: \\t\\t, \\n\\r, NBSP·\\t)
-     *  contribute nothing to the COMMON bucket — only (script, space) and
-     *  (space, script) boundary bigrams survive. */
-    static int normalizeWhitespace(int cp) {
-        if (cp == 0x20) {
-            return cp;
-        }
-        int t = Character.getType(cp);
-        if (t == Character.CONTROL
-                || t == Character.SPACE_SEPARATOR
-                || t == Character.LINE_SEPARATOR
-                || t == Character.PARAGRAPH_SEPARATOR) {
-            return 0x20;
-        }
-        return cp;
-    }
-
     /**
      * Enumerates the script-bucketed bigrams of {@code cps} under the redesign
      * representation (see block comment above).  Used identically by inference
@@ -1179,6 +1154,36 @@ public final class JunkDetector implements TextQualityDetector {
         return buckets;
     }
 
+    /** "Real" structural whitespace collapses to canonical U+0020 before bigram
+     *  emission.  Matches {@link #computeZ3ControlByte}'s definition of
+     *  non-anomalous whitespace: HT (0x09), LF (0x0A), CR (0x0D), regular
+     *  space (0x20), plus the Zs/Zl/Zp Unicode categories (NBSP, ideographic
+     *  space, line/paragraph separators).
+     *
+     *  <p><strong>Anomalous Cc (0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F, U+0085
+     *  NEL, U+0080-U+009F C1 controls) and Cf (format chars) are DELIBERATELY
+     *  NOT normalized.</strong>  Their OOV-floor signal is carrying real
+     *  evidence that the decode is wrong — e.g., windows-1252 bytes 0x80-0x9F
+     *  decode to printable curly quotes / em-dashes; ISO-8859-16 misdecodes
+     *  them as C1 control codepoints; the bigram-table OOV-floor on those
+     *  Cc-touching bigrams is what correctly penalizes the wrong decode.
+     *  z3 has had this distinction since v15; this brings z1 in line. */
+    static int normalizeWhitespace(int cp) {
+        if (cp == 0x20) {
+            return cp;
+        }
+        if (cp == 0x09 || cp == 0x0A || cp == 0x0D) {
+            return 0x20;
+        }
+        int t = Character.getType(cp);
+        if (t == Character.SPACE_SEPARATOR
+                || t == Character.LINE_SEPARATOR
+                || t == Character.PARAGRAPH_SEPARATOR) {
+            return 0x20;
+        }
+        return cp;
+    }
+
     public static void forEachScriptBigram(int[] cps, BigramSink sink) {
         if (cps == null || cps.length < 2) {
             return;
@@ -1186,13 +1191,7 @@ public final class JunkDetector implements TextQualityDetector {
         for (int i = 0; i + 1 < cps.length; i++) {
             int a = normalizeWhitespace(cps[i]);
             int b = normalizeWhitespace(cps[i + 1]);
-            // (space, space) drop short-circuits ahead of the digit check:
-            // after normalization, every whitespace run (\t\t, \n\r, NBSP·\t,
-            // HtmlContentCleaner indentation salads) collapses to this one
-            // bigram class — high count on HTML, zero script-boundary signal.
-            // (script, space) / (space, script) boundary bigrams survive.
-            if ((a == 0x20 && b == 0x20)
-                    || isSkipCodepoint(a) || isSkipCodepoint(b)) {
+            if (isSkipCodepoint(a) || isSkipCodepoint(b)) {
                 continue;
             }
             String ka = classKey(a);
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
index adb5b89789..25f07b765f 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/TextQualityFeatures.java
@@ -201,27 +201,73 @@ public final class TextQualityFeatures {
     }
 
     /**
-     * z8: fraction of codepoints that are the Unicode REPLACEMENT CHARACTER
-     * (U+FFFD).  Direct decode-failure signal — Java's CharsetDecoder emits
-     * one U+FFFD per malformed/unmappable byte.  Continuous (not a binary
-     * threshold) so the JunkDetector LR can learn a proportional weight on
-     * it rather than vetoing decodes that happen to contain any.
+     * z6 raw input: fraction of codepoints that are "anomaly indicators" —
+     * codepoints that shouldn't appear in correctly-decoded natural text.
+     * Direct decode-failure signal — wrong encodings produce these in bulk
+     * (Java's CharsetDecoder emits U+FFFD per malformed byte; ISO-8859-X
+     * misreads windows-1252 high bytes as C1 controls; PDF cmap failures
+     * emit private-use codepoints; etc.).
+     *
+     * <p>Anomaly set:
+     * <ul>
+     *   <li>U+FFFD (REPLACEMENT CHARACTER) — the direct decode-failure marker
+     *   <li>Anomalous Cc: {@code 0x01-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F}
+     *       (matching z3's byte-level anomaly definition, at codepoint level)
+     *   <li>C1 control codepoints: {@code U+0080-U+009F} — the
+     *       ISO-8859-X-misdecodes-windows-1252 signal
+     *   <li>Private use area: {@code U+E000-U+F8FF}, plus planes 15-16 PUA —
+     *       the PDF cmap-failure signal
+     * </ul>
+     *
+     * <p>Continuous (not a binary threshold) so the JunkDetector combiner LR
+     * can learn a proportional weight on it.  Excluded: 0x00 (NUL — can
+     * occur legitimately in some text streams; matches z3's exclusion);
+     * 0x09/0x0A/0x0D/0x20 (legitimate whitespace); Cf format chars (ZWJ etc.
+     * have legitimate linguistic uses); Cn unassigned (rare in practice).
      */
     public static double replacementRatio(String text) {
         if (text == null || text.isEmpty()) {
             return Double.NaN;
         }
         int total = 0;
-        int replacements = 0;
+        int anomaly = 0;
         for (int i = 0; i < text.length(); ) {
             int cp = text.codePointAt(i);
             i += Character.charCount(cp);
             total++;
-            if (cp == 0xFFFD) {
-                replacements++;
+            if (isAnomalyCodepoint(cp)) {
+                anomaly++;
             }
         }
-        return total == 0 ? Double.NaN : (double) replacements / total;
+        return total == 0 ? Double.NaN : (double) anomaly / total;
+    }
+
+    /** True if {@code cp} is in the z6 anomaly set: U+FFFD, anomalous Cc
+     *  (matching z3 byte-level definition), C1 controls, or private use. */
+    static boolean isAnomalyCodepoint(int cp) {
+        if (cp == 0xFFFD) {
+            return true;
+        }
+        // Anomalous Cc (excludes 0x00, 0x09, 0x0A, 0x0D — match z3)
+        if ((cp >= 0x01 && cp <= 0x08)
+                || cp == 0x0B || cp == 0x0C
+                || (cp >= 0x0E && cp <= 0x1F)
+                || cp == 0x7F) {
+            return true;
+        }
+        // C1 controls — the ISO-8859-X-misreads-windows-1252 signal
+        if (cp >= 0x0080 && cp <= 0x009F) {
+            return true;
+        }
+        // Private use area (BMP)
+        if (cp >= 0xE000 && cp <= 0xF8FF) {
+            return true;
+        }
+        // Supplementary PUA (planes 15 and 16)
+        if (cp >= 0xF0000 && cp <= 0x10FFFD) {
+            return true;
+        }
+        return false;
     }
 
     /**
diff --git a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin
index c5a88fa093..a83dd647e6 100644
Binary files a/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin and b/tika-ml/tika-ml-junkdetect/src/main/resources/org/apache/tika/ml/junkdetect/junkdetect.bin differ

(tika) 09/09: TIKA-4731 - whitespace cleanup, take 32

Reply via email to