(tika) 01/09: TIKA-4731 - strip html entities, rebalance training, add modest clipping

tallison Tue, 26 May 2026 12:19:53 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4731-common-script
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 02e2f9b9d550024c6be105f4d03be269faf0c7ce
Author: tallison <[email protected]>
AuthorDate: Tue May 19 11:34:07 2026 -0400

    TIKA-4731 - strip html entities, rebalance training, add modest clipping
---
 .../tika/ml/chardetect/HtmlByteStripper.java       | 171 +++++++++++++++++-
 .../ml/chardetect/MojibusterEncodingDetector.java  |  18 +-
 .../NaiveBayesBigramEncodingDetector.java          | 170 +++++++++++++-----
 .../org/apache/tika/ml/chardetect/nb-bigram.bin    | Bin 1020112 -> 1016638 bytes
 .../tika/ml/chardetect/HtmlByteStripperTest.java   | 191 +++++++++++++++++++++
 .../apache/tika/ml/chardetect/TraceMojibuster.java | 142 +++++++++++++--
 .../ml/chardetect/tools/TrainNaiveBayesBigram.java |  41 ++++-
 7 files changed, 662 insertions(+), 71 deletions(-)

diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
index 1443ae9723..09768e0977 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
@@ -66,6 +66,29 @@ public final class HtmlByteStripper {
      *  end at the next {@code >}. Internal subsets ({@code <!DOCTYPE foo [ ... ]>})
      *  are rare; we'd stop at the first nested {@code >}. Acceptable. */
     private static final int DECL_OR_PI = 10;
+    /** Just saw {@code &}.  Next byte decides whether this is a named
+     *  entity ({@code &amp;}), a numeric reference ({@code &#169;} or
+     *  {@code &#xA9;}), or a stray ampersand. */
+    private static final int ENTITY = 11;
+    private static final int ENTITY_NAME = 12;
+    /** Just saw {@code &#}.  Next byte: {@code x}/{@code X} for hex, or
+     *  decimal digit for decimal. */
+    private static final int ENTITY_NUM = 13;
+    private static final int ENTITY_DEC = 14;
+    private static final int ENTITY_HEX = 15;
+
+    /**
+     * Maximum body length for an entity (bytes after {@code &}, including
+     * any {@code #}/{@code x} prefix, excluding the trailing {@code ;}).
+     * Standard HTML5 named entities are at most 31 bytes by this measure
+     * ({@code &CounterClockwiseContourIntegral;}), but only a few dozen
+     * exceed 16; 99% of real-world entities are well under that.  A 16-byte
+     * cap covers the common cases ({@code &nbsp;}, {@code &laquo;},
+     * {@code &Aacute;}, {@code &hellip;}, {@code &middot;}, etc.) and
+     * bounds pathological input that might otherwise eat language-content
+     * bytes before bailing out.
+     */
+    private static final int MAX_ENTITY_BODY_LEN = 16;
 
     private static final byte[] SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
     private static final byte[] STYLE = {'s', 't', 'y', 'l', 'e'};
@@ -105,10 +128,13 @@ public final class HtmlByteStripper {
         public final int length;
         /** Number of well-formed tags parsed (including comments). */
         public final int tagCount;
+        /** Number of well-formed HTML entities stripped from TEXT. */
+        public final int entityCount;
 
-        public Result(int length, int tagCount) {
+        public Result(int length, int tagCount, int entityCount) {
             this.length = length;
             this.tagCount = tagCount;
+            this.entityCount = entityCount;
         }
     }
 
@@ -140,6 +166,11 @@ public final class HtmlByteStripper {
         int rawMatch = 0;
         int end = srcOffset + srcLen;
         int tagCount = 0;
+        int entityCount = 0;
+        // Position of the leading '&' for the in-progress entity.
+        // Tracked so the bailout path can emit the consumed prefix
+        // as literal text when the parse fails (e.g. "AT&T").
+        int entityStart = 0;
         int attrNameStart = 0;
         // When true, the current quoted attribute value's bytes are
         // emitted to dst (attribute name matched TEXT_ATTRS).  Reset
@@ -152,6 +183,9 @@ public final class HtmlByteStripper {
                 case TEXT:
                     if (b == '<') {
                         state = LT;
+                    } else if (b == '&') {
+                        state = ENTITY;
+                        entityStart = i;
                     } else {
                         dst[w++] = b;
                     }
@@ -189,6 +223,121 @@ public final class HtmlByteStripper {
                     }
                     break;
 
+                case ENTITY:
+                    // First byte after '&' decides path.
+                    if (b == '#') {
+                        state = ENTITY_NUM;
+                    } else if (isAsciiLetter(b)) {
+                        state = ENTITY_NAME;
+                    } else {
+                        // Not entity-shaped: emit consumed prefix (just '&')
+                        // and re-process b under TEXT semantics.
+                        for (int k = entityStart; k < i; k++) {
+                            dst[w++] = src[k];
+                        }
+                        if (b == '<') {
+                            state = LT;
+                        } else if (b == '&') {
+                            entityStart = i;
+                            // state stays ENTITY
+                        } else {
+                            dst[w++] = b;
+                            state = TEXT;
+                        }
+                    }
+                    break;
+
+                case ENTITY_NAME:
+                    if (b == ';') {
+                        entityCount++;
+                        state = TEXT;
+                    } else if (isAsciiLetter(b)
+                            && (i - entityStart) <= MAX_ENTITY_BODY_LEN) {
+                        // continue accumulating; no emit
+                    } else {
+                        for (int k = entityStart; k < i; k++) {
+                            dst[w++] = src[k];
+                        }
+                        if (b == '<') {
+                            state = LT;
+                        } else if (b == '&') {
+                            entityStart = i;
+                            state = ENTITY;
+                        } else {
+                            dst[w++] = b;
+                            state = TEXT;
+                        }
+                    }
+                    break;
+
+                case ENTITY_NUM:
+                    // First byte after '&#': 'x'/'X' for hex, digit for decimal.
+                    if (b == 'x' || b == 'X') {
+                        state = ENTITY_HEX;
+                    } else if (isAsciiDigit(b)) {
+                        state = ENTITY_DEC;
+                    } else {
+                        for (int k = entityStart; k < i; k++) {
+                            dst[w++] = src[k];
+                        }
+                        if (b == '<') {
+                            state = LT;
+                        } else if (b == '&') {
+                            entityStart = i;
+                            state = ENTITY;
+                        } else {
+                            dst[w++] = b;
+                            state = TEXT;
+                        }
+                    }
+                    break;
+
+                case ENTITY_DEC:
+                    if (b == ';') {
+                        entityCount++;
+                        state = TEXT;
+                    } else if (isAsciiDigit(b)
+                            && (i - entityStart) <= MAX_ENTITY_BODY_LEN) {
+                        // continue
+                    } else {
+                        for (int k = entityStart; k < i; k++) {
+                            dst[w++] = src[k];
+                        }
+                        if (b == '<') {
+                            state = LT;
+                        } else if (b == '&') {
+                            entityStart = i;
+                            state = ENTITY;
+                        } else {
+                            dst[w++] = b;
+                            state = TEXT;
+                        }
+                    }
+                    break;
+
+                case ENTITY_HEX:
+                    if (b == ';') {
+                        entityCount++;
+                        state = TEXT;
+                    } else if (isHexDigit(b)
+                            && (i - entityStart) <= MAX_ENTITY_BODY_LEN) {
+                        // continue
+                    } else {
+                        for (int k = entityStart; k < i; k++) {
+                            dst[w++] = src[k];
+                        }
+                        if (b == '<') {
+                            state = LT;
+                        } else if (b == '&') {
+                            entityStart = i;
+                            state = ENTITY;
+                        } else {
+                            dst[w++] = b;
+                            state = TEXT;
+                        }
+                    }
+                    break;
+
                 case TAG_NAME:
                     if (isTagNameTerminator(b)) {
                         int nameLen = i - nameStart;
@@ -332,7 +481,15 @@ public final class HtmlByteStripper {
             }
         }
 
-        return new Result(w - dstOffset, tagCount);
+        // Unterminated entity at EOF: emit consumed prefix as literal text.
+        if (state == ENTITY || state == ENTITY_NAME || state == ENTITY_NUM
+                || state == ENTITY_DEC || state == ENTITY_HEX) {
+            for (int k = entityStart; k < end; k++) {
+                dst[w++] = src[k];
+            }
+        }
+
+        return new Result(w - dstOffset, tagCount, entityCount);
     }
 
     /**
@@ -350,6 +507,16 @@ public final class HtmlByteStripper {
         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
     }
 
+    private static boolean isAsciiDigit(byte b) {
+        int c = b & 0xFF;
+        return c >= '0' && c <= '9';
+    }
+
+    private static boolean isHexDigit(byte b) {
+        int c = b & 0xFF;
+        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' 
&& c <= 'f');
+    }
+
     private static boolean isTagNameTerminator(byte b) {
         return b == ' ' || b == '\t' || b == '\n' || b == '\r' || b == '>' || 
b == '/';
     }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 806e1a2251..78dc9400ae 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -97,6 +97,15 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
      */
     private static final int MIN_TAG_COUNT_TO_USE_STRIP = 1;
 
+    /**
+     * Minimum HTML entity count to apply the stripper even when no
+     * well-formed tags are present.  A single stray {@code &amp;}
+     * mention in plain prose shouldn't trigger the strip path, but
+     * entity-heavy content (HTML-quoted text in a plain-text file,
+     * truncated reads where the leading tag was lost, etc.) should.
+     */
+    private static final int MIN_ENTITY_COUNT_TO_USE_STRIP = 3;
+
     /**
      * Confidence attached to UTF-32 structural candidates — high but
      * sub-1.0 so the ResultType.STRUCTURAL flag carries meaning
@@ -674,10 +683,11 @@ public class MojibusterEncodingDetector implements 
EncodingDetector {
         byte[] dst = new byte[probe.length];
         HtmlByteStripper.Result stripped =
                 HtmlByteStripper.strip(probe, 0, probe.length, dst, 0);
-        if (stripped.tagCount < MIN_TAG_COUNT_TO_USE_STRIP) {
-            // No well-formed tags found — probe isn't markup (or the
-            // bytes don't parse as markup in any ASCII-compatible
-            // reading).  Use original.
+        if (stripped.tagCount < MIN_TAG_COUNT_TO_USE_STRIP
+                && stripped.entityCount < MIN_ENTITY_COUNT_TO_USE_STRIP) {
+            // No well-formed tags AND not enough entities to be markup —
+            // probe isn't markup (or the bytes don't parse as markup in
+            // any ASCII-compatible reading).  Use original.
             return probe;
         }
         byte[] trimmed = new byte[stripped.length];
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
index 98aae326b7..84d721bc12 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
@@ -101,7 +101,53 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
      * of how uncertain the model actually is, so it cannot serve as
      * a candidate-emission gate.</p>
      */
-    private static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20;
+    public static final double MARGIN_THRESHOLD_NATS_PER_BIGRAM = 0.20;
+
+    /**
+     * Per-bigram cross-class total-contribution cap (Type C clipping).
+     * For each distinct bigram in the probe, the top-scoring class's
+     * total contribution (count × logP × idf, after dequantization) is
+     * capped at the runner-up class's contribution + this many nats.
+     *
+     * <p>Defends against corpus-skew pathologies where one class
+     * accumulates extreme bigram mass that swings classification on
+     * one or two byte-pairs alone (e.g., Czech "ČR" digraph in
+     * ISO-8859-2 contributing +186 nats over win-1252 on Italian text).
+     * Length-invariant by construction: the cap is on per-bigram
+     * advantage, regardless of how many times the bigram appears.</p>
+     *
+     * <p>20 nats = e^20 ≈ 5×10^8 probability-ratio advantage per
+     * bigram — preserves legitimate CJK-vs-Latin and other cross-script
+     * signal while bounding the diffuse-corpus-skew tail.</p>
+     */
+    public static final double CAP_PER_BIGRAM_NATS = 20.0;
+
+    /**
+     * Minimum distinct bigrams required before the per-bigram cap
+     * applies.  On short probes, each bigram carries proportionally
+     * more signal — clipping would destroy more discrimination than
+     * it saves.
+     */
+    public static final int MIN_DISTINCT_FOR_CAP = 30;
+
+    /**
+     * Minimum distinct-bigram fraction of total-scored-bigrams.  Below
+     * this, the input is treated as degenerate (looped / repeated /
+     * corrupt) and {@link #scoreClassesAndCount(byte[])} returns
+     * {@code null} so callers can fall back.  Defends against pathological
+     * inputs like {@code "thththth..."} where one bigram appears
+     * hundreds of times.
+     */
+    public static final double MIN_DIVERSITY_RATIO = 0.02;
+
+    /**
+     * Minimum scored bigrams required before the diversity gate
+     * applies.  Short probes legitimately have lower diversity ratios
+     * (fewer total bigrams = fewer opportunities for distinct ones)
+     * and shouldn't be gated as degenerate.  Above this floor, the
+     * ratio measurement is meaningful.
+     */
+    public static final int MIN_BIGRAMS_FOR_DIVERSITY_GATE = 100;
 
     private final String[] labels;
     /** Charset objects cached at load — one {@code Charset.forName} per 
class, ever. */
@@ -263,45 +309,8 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
      * tiny probes that can't be scored.
      */
     public double[] scoreClasses(byte[] probe) {
-        if (probe == null || probe.length < 2) {
-            return null;
-        }
-        int len = Math.min(probe.length, MAX_PROBE_BYTES);
-
-        // Integer hot loop — CharSoup-style.  int8 logP × int8 IDF →
-        // int16 product, accumulated into int32 per class.  Overflow
-        // safety: at MAX_PROBE_BYTES=16384, max 16383 bigrams × 127 × 127
-        // ≈ 264M per class, well inside int32's 2.1B headroom.
-        int[] dots = new int[numClasses];
-        for (int i = 0; i + 1 < len; i++) {
-            int b0 = probe[i] & 0xFF;
-            int b1 = probe[i + 1] & 0xFF;
-            // γ: bigrams where both bytes are ASCII whitespace carry no
-            // encoding signal, and per-class training-data preparation
-            // varies in how it handles consecutive whitespace (GB18030's
-            // training collapsed it; others retained it).  That asymmetry
-            // can dominate scoring on HTML-stripped probes where
-            // whitespace bigrams are the highest-frequency tokens.  Skip.
-            if (isWhitespace(b0) && isWhitespace(b1)) {
-                continue;
-            }
-            int bigram = (b0 << 8) | b1;
-            int w = idf8[bigram];  // non-negative, 0..127
-            if (w == 0) {
-                continue; // bigram has no discriminative power; skip
-            }
-            int base = bigram * numClasses;
-            for (int c = 0; c < numClasses; c++) {
-                dots[c] += logP8[base + c] * w;
-            }
-        }
-
-        // Single per-class dequantization at end of probe.
-        double[] score = new double[numClasses];
-        for (int c = 0; c < numClasses; c++) {
-            score[c] = dots[c] * perClassDequant[c];
-        }
-        return score;
+        ScoreResult sr = scoreClassesAndCount(probe);
+        return sr == null ? null : sr.scores;
     }
 
     /**
@@ -371,7 +380,15 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
             return null;
         }
         int len = Math.min(probe.length, MAX_PROBE_BYTES);
-        int[] dots = new int[numClasses];
+
+        // Pass 1: count occurrences of each bigram.  Whitespace and
+        // zero-IDF bigrams are skipped as in the original hot loop.  A
+        // short[] suffices: a probe holds at most 16383 bigrams.  The ids
+        // of distinct bigrams are tracked in a parallel array so pass 2
+        // doesn't need to scan the full 65k space.
+        short[] count = new short[BIGRAM_SPACE];
+        int[] distinctBigrams = new int[len];
+        int distinctIdx = 0;
         int scored = 0;
         int total = 0;
         for (int i = 0; i + 1 < len; i++) {
@@ -387,14 +404,75 @@ public class NaiveBayesBigramEncodingDetector implements 
EncodingDetector {
                 continue;
             }
             scored++;
-            int base = bigram * numClasses;
-            for (int c = 0; c < numClasses; c++) {
-                dots[c] += logP8[base + c] * w;
+            if (count[bigram] == 0) {
+                distinctBigrams[distinctIdx++] = bigram;
             }
+            count[bigram]++;
         }
+
+        // Type A — diversity gate.  If the input has too few distinct
+        // bigrams relative to total scored bigrams, it's a degenerate
+        // / looped input ("thththth..." or worse).  Abstain — caller
+        // falls back.  Only applied above a minimum scored-bigrams
+        // floor, since short probes legitimately have lower diversity
+        // ratios.
+        if (scored >= MIN_BIGRAMS_FOR_DIVERSITY_GATE
+                && (double) distinctIdx / scored < MIN_DIVERSITY_RATIO) {
+            return null;
+        }
+
+        // Type C — per-bigram total-contribution cap.  Only applies
+        // when we have enough distinct bigrams that capping any single
+        // one won't destroy a large fraction of the discriminative
+        // signal.  Below the floor, short-probe semantics rule: every
+        // bigram counts fully.
+        boolean applyCap = distinctIdx >= MIN_DISTINCT_FOR_CAP;
+
+        // Pass 2: per distinct bigram, compute per-class total
+        // contribution and (when above floor) apply Type C cap.
         double[] score = new double[numClasses];
-        for (int c = 0; c < numClasses; c++) {
-            score[c] = dots[c] * perClassDequant[c];
+        double[] contributions = new double[numClasses];
+        for (int k = 0; k < distinctIdx; k++) {
+            int bigram = distinctBigrams[k];
+            int n = count[bigram];
+            int w = idf8[bigram];
+            double countTimesIdf = (double) n * w;
+            int base = bigram * numClasses;
+
+            if (!applyCap) {
+                // Fast path: no cap, just accumulate.
+                for (int c = 0; c < numClasses; c++) {
+                    score[c] += logP8[base + c] * countTimesIdf * 
perClassDequant[c];
+                }
+                continue;
+            }
+
+            // logPs are negative; "best" class for the bigram = highest
+            // (least negative) contribution after dequant.
+            double max = Double.NEGATIVE_INFINITY;
+            double secondMax = Double.NEGATIVE_INFINITY;
+            for (int c = 0; c < numClasses; c++) {
+                double contrib = logP8[base + c] * countTimesIdf * 
perClassDequant[c];
+                contributions[c] = contrib;
+                if (contrib > max) {
+                    secondMax = max;
+                    max = contrib;
+                } else if (contrib > secondMax) {
+                    secondMax = contrib;
+                }
+            }
+            // Cap any class whose contribution exceeds runner-up + cap.
+            double cap = secondMax + CAP_PER_BIGRAM_NATS;
+            if (max > cap) {
+                for (int c = 0; c < numClasses; c++) {
+                    if (contributions[c] > cap) {
+                        contributions[c] = cap;
+                    }
+                }
+            }
+            for (int c = 0; c < numClasses; c++) {
+                score[c] += contributions[c];
+            }
         }
         return new ScoreResult(score, scored, total);
     }
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
index c71475ebaa..bcfce41d67 100644
Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 and 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
 differ
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
new file mode 100644
index 0000000000..28b027f47c
--- /dev/null
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/HtmlByteStripperTest.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.charset.StandardCharsets;
+
+import org.junit.jupiter.api.Test;
+
+public class HtmlByteStripperTest {
+
+    /** Helper: strip a string and return the (text, tagCount, entityCount) tuple. */
+    private static StripOutcome strip(String input) {
+        byte[] src = input.getBytes(StandardCharsets.US_ASCII);
+        byte[] dst = new byte[src.length];
+        HtmlByteStripper.Result r = HtmlByteStripper.strip(src, 0, src.length, 
dst, 0);
+        return new StripOutcome(new String(dst, 0, r.length, 
StandardCharsets.US_ASCII),
+                r.tagCount, r.entityCount);
+    }
+
+    private static final class StripOutcome {
+        final String text;
+        final int tagCount;
+        final int entityCount;
+        StripOutcome(String text, int tagCount, int entityCount) {
+            this.text = text;
+            this.tagCount = tagCount;
+            this.entityCount = entityCount;
+        }
+    }
+
+    @Test
+    public void namedEntityIsStripped() {
+        StripOutcome r = strip("hello &amp; world");
+        assertEquals("hello  world", r.text);
+        assertEquals(1, r.entityCount);
+        assertEquals(0, r.tagCount);
+    }
+
+    @Test
+    public void decimalNumericEntityIsStripped() {
+        StripOutcome r = strip("foo &#169; bar");
+        assertEquals("foo  bar", r.text);
+        assertEquals(1, r.entityCount);
+    }
+
+    @Test
+    public void hexNumericEntityIsStripped() {
+        StripOutcome r = strip("foo &#x00A9; bar");
+        assertEquals("foo  bar", r.text);
+        assertEquals(1, r.entityCount);
+    }
+
+    @Test
+    public void hexNumericEntityUppercaseXIsStripped() {
+        StripOutcome r = strip("a&#XA9;b");
+        assertEquals("ab", r.text);
+        assertEquals(1, r.entityCount);
+    }
+
+    @Test
+    public void ampersandFollowedByLetterWithoutSemicolonIsLiteral() {
+        // AT&T pattern: & followed by letter(s) but no closing ';'
+        StripOutcome r = strip("AT&T Inc");
+        assertEquals("AT&T Inc", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void ampersandFollowedByNonLetterIsLiteral() {
+        // Q&A pattern: & followed by uppercase letter then space → bailout
+        StripOutcome r = strip("Q&A session");
+        assertEquals("Q&A session", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void ampersandFollowedBySpaceIsLiteral() {
+        StripOutcome r = strip("a & b");
+        assertEquals("a & b", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void ampersandAtEndOfInputIsLiteral() {
+        StripOutcome r = strip("end&");
+        assertEquals("end&", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void unclosedEntityNameAtEndOfInputIsLiteral() {
+        StripOutcome r = strip("end&foo");
+        assertEquals("end&foo", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void unclosedNumericEntityAtEndOfInputIsLiteral() {
+        StripOutcome r = strip("end&#123");
+        assertEquals("end&#123", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void entityExceedingLengthCapIsLiteral() {
+        // Standard HTML5 entity longer than the 16-byte cap.
+        StripOutcome r = strip("x&CounterClockwiseContourIntegral;y");
+        // The cap kicks in mid-body; the consumed prefix is emitted as
+        // literal text, then the rest of the bytes follow as text.
+        // Exact prefix depends on cap; key assertion is the entity was
+        // NOT counted as stripped.
+        assertEquals(0, r.entityCount);
+        // The full input is preserved as text (cap bailout emits what
+        // it consumed, and the remaining tail follows naturally).
+        assertEquals("x&CounterClockwiseContourIntegral;y", r.text);
+    }
+
+    @Test
+    public void adjacentEntitiesAreAllStripped() {
+        StripOutcome r = strip("&amp;&amp;&amp;");
+        assertEquals("", r.text);
+        assertEquals(3, r.entityCount);
+    }
+
+    @Test
+    public void ampersandCascadingIntoTagWorks() {
+        // & followed by letters then '<' should emit the bailout prefix
+        // and then transition into tag-stripping.
+        StripOutcome r = strip("a&foo<b>c");
+        assertEquals("a&fooc", r.text);
+        assertEquals(0, r.entityCount);
+        assertEquals(1, r.tagCount);
+    }
+
+    @Test
+    public void ampersandCascadingIntoAnotherEntity() {
+        // & followed by non-entity content then another &amp; — the
+        // first '&' should emit literal, second '&' starts a new entity.
+        StripOutcome r = strip("a&!&amp;b");
+        assertEquals("a&!b", r.text);
+        assertEquals(1, r.entityCount);
+    }
+
+    @Test
+    public void tagWithEntityInBodyStripsBoth() {
+        StripOutcome r = strip("<p>hello&nbsp;world</p>");
+        assertEquals("helloworld", r.text);
+        assertEquals(2, r.tagCount);
+        assertEquals(1, r.entityCount);
+    }
+
+    @Test
+    public void plainTextNoMarkup() {
+        StripOutcome r = strip("just plain text, no markup at all");
+        assertEquals("just plain text, no markup at all", r.text);
+        assertEquals(0, r.tagCount);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void emptyEntityIsLiteral() {
+        // "&;" — & followed immediately by ';' (not letter / not '#')
+        StripOutcome r = strip("a&;b");
+        assertEquals("a&;b", r.text);
+        assertEquals(0, r.entityCount);
+    }
+
+    @Test
+    public void numericEmptyBodyIsLiteral() {
+        // "&#;" — &# followed by ';' (not 'x' / not digit)
+        StripOutcome r = strip("a&#;b");
+        assertEquals("a&#;b", r.text);
+        assertEquals(0, r.entityCount);
+    }
+}
diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
index db9346eb90..99017e7a3f 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/TraceMojibuster.java
@@ -21,6 +21,7 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
@@ -39,6 +40,10 @@ public final class TraceMojibuster {
     public static void main(String[] args) throws Exception {
         Path probeDir = null;
         String[] probes = null;
+        Path inlineFile = null;
+        byte[] inlineBytes = null;
+        boolean fullRanking = false;
+        int rankingTopN = 25;
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
                 case "--probe-dir":
@@ -48,13 +53,30 @@ public final class TraceMojibuster {
                 case "--probes":
                     probes = args[++i].split(",");
                     break;
+                case "--file":
+                    inlineFile = Paths.get(args[++i].replaceFirst("^~",
+                            System.getProperty("user.home")));
+                    break;
+                case "--bytes-hex":
+                    inlineBytes = decodeHex(args[++i]);
+                    break;
+                case "--full-ranking":
+                    fullRanking = true;
+                    break;
+                case "--ranking-top-n":
+                    rankingTopN = Integer.parseInt(args[++i]);
+                    break;
                 default:
                     System.err.println("Unknown arg: " + args[i]);
                     System.exit(1);
             }
         }
-        if (probeDir == null || probes == null) {
-            System.err.println("Usage: TraceMojibuster --probe-dir <dir> 
--probes p1,p2,...");
+        boolean hasInline = inlineFile != null || inlineBytes != null;
+        boolean hasProbeList = probeDir != null && probes != null;
+        if (!hasInline && !hasProbeList) {
+            System.err.println("Usage: TraceMojibuster"
+                    + " (--probe-dir <dir> --probes p1,p2,... | --file <path> 
| --bytes-hex <hex>)"
+                    + " [--full-ranking] [--ranking-top-n N]");
             System.exit(1);
         }
         // Load the bundled model from the classpath (same path Mojibuster 
uses).
@@ -67,33 +89,62 @@ public final class TraceMojibuster {
         }
         MojibusterEncodingDetector det = new MojibusterEncodingDetector();
 
-        for (String pid : probes) {
-            Path p = probeDir.resolve(pid);
-            if (!Files.exists(p)) {
-                System.err.println("Missing: " + p);
-                continue;
+        // Build the (label, bytes) work list.  Inline probes (--file /
+        // --bytes-hex) are processed first; then any --probes from
+        // --probe-dir.
+        List<ProbeInput> work = new ArrayList<>();
+        if (inlineFile != null) {
+            if (!Files.exists(inlineFile)) {
+                System.err.println("Missing: " + inlineFile);
+                System.exit(1);
             }
-            byte[] bytes = Files.readAllBytes(p);
-            String shortId = pid.contains("/")
-                    ? pid.substring(pid.indexOf('/') + 1, pid.indexOf('/') + 
13) : pid;
+            work.add(new ProbeInput(inlineFile.getFileName().toString(),
+                    Files.readAllBytes(inlineFile)));
+        }
+        if (inlineBytes != null) {
+            work.add(new ProbeInput("inline-hex(" + inlineBytes.length + "B)",
+                    inlineBytes));
+        }
+        if (probes != null) {
+            for (String pid : probes) {
+                Path p = probeDir.resolve(pid);
+                if (!Files.exists(p)) {
+                    System.err.println("Missing: " + p);
+                    continue;
+                }
+                String shortId = pid.contains("/")
+                        ? pid.substring(pid.indexOf('/') + 1, pid.indexOf('/') 
+ 13) : pid;
+                work.add(new ProbeInput(shortId, Files.readAllBytes(p)));
+            }
+        }
+
+        for (ProbeInput w : work) {
+            byte[] bytes = w.bytes;
             System.out.println();
-            System.out.println("==== " + shortId + "  raw=" + bytes.length + " 
bytes ====");
+            System.out.println("==== " + w.label + "  raw=" + bytes.length + " 
bytes ====");
 
             // Layer 1: raw NB on raw bytes (no strip).
             List<EncodingResult> rawResults = rawNb.detect(bytes);
             System.out.println("  raw NB (no strip):       " + 
fmt(rawResults));
+            if (fullRanking) {
+                dumpFullRanking(rawNb, bytes, "raw", rankingTopN);
+            }
 
             // Layer 2: NB on HTML-stripped bytes.
             byte[] dst = new byte[bytes.length];
             HtmlByteStripper.Result sr = HtmlByteStripper.strip(bytes, 0, 
bytes.length, dst, 0);
+            byte[] strippedView = null;
             if (sr.tagCount >= 1) {
-                byte[] stripped = new byte[sr.length];
-                System.arraycopy(dst, 0, stripped, 0, sr.length);
+                strippedView = new byte[sr.length];
+                System.arraycopy(dst, 0, strippedView, 0, sr.length);
                 System.out.printf(Locale.ROOT,
                         "  HTML strip: tags=%d, post-strip=%d bytes (%.1f%% 
kept)%n",
                         sr.tagCount, sr.length, 100.0 * sr.length / 
bytes.length);
-                List<EncodingResult> stripResults = rawNb.detect(stripped);
+                List<EncodingResult> stripResults = rawNb.detect(strippedView);
                 System.out.println("  NB on stripped bytes:    " + 
fmt(stripResults));
+                if (fullRanking) {
+                    dumpFullRanking(rawNb, strippedView, "strip", rankingTopN);
+                }
             } else {
                 System.out.println("  HTML strip: tagCount=0 (backoff, used 
original)");
             }
@@ -104,6 +155,69 @@ public final class TraceMojibuster {
         }
     }
 
+    /**
+     * Prints the highest-scoring classes for one probe, ordered by descending
+     * raw NB log-score.  At most {@code topN} rows are printed (fewer if the
+     * model has fewer labels), not the whole class list.
+     *
+     * <p>A header line reports the scored and total bigram counts and the
+     * margin gate, computed here as {@code MARGIN_THRESHOLD_NATS_PER_BIGRAM}
+     * times {@code max(1, scoredBigrams)}.  Each row then shows the rank,
+     * label, score, gap from the top score, gap per scored bigram
+     * ({@code NaN} when no bigram was scored) and a marker: {@code top1} for
+     * the first row, {@code EMIT} when the gap is below the margin gate,
+     * {@code ----} otherwise.
+     *
+     * <p>If {@code scoreClassesAndCount} returns {@code null}, a single
+     * placeholder line is printed and nothing else.
+     *
+     * <p>NOTE(review): the EMIT marker is computed locally; presumably it
+     * mirrors the detector's own margin gate -- confirm against
+     * NaiveBayesBigramEncodingDetector.
+     *
+     * @param nb    detector supplying the scores and the label list
+     * @param probe bytes to score
+     * @param layer short tag echoed in the output to identify the input variant
+     * @param topN  maximum number of ranked rows to print
+     */
+    private static void dumpFullRanking(NaiveBayesBigramEncodingDetector nb,
+                                        byte[] probe, String layer, int topN) {
+        NaiveBayesBigramEncodingDetector.ScoreResult sr = 
nb.scoreClassesAndCount(probe);
+        if (sr == null) {
+            System.out.println("    [full-ranking " + layer + "]  <probe too 
short to score>");
+            return;
+        }
+        String[] labels = nb.getLabels();
+        int n = labels.length;
+        // Boxed indices so Arrays.sort can take a comparator.
+        Integer[] idx = new Integer[n];
+        for (int i = 0; i < n; i++) idx[i] = i;
+        final double[] scores = sr.scores;
+        // Comparator arguments are swapped (b before a) to sort by descending score.
+        java.util.Arrays.sort(idx, (a, b) -> Double.compare(scores[b], 
scores[a]));
+        double top1 = scores[idx[0]];
+        // The bigram count is floored at 1 so the gate is never scaled by zero.
+        double marginNats = 
NaiveBayesBigramEncodingDetector.MARGIN_THRESHOLD_NATS_PER_BIGRAM
+                * Math.max(1, sr.scoredBigrams);
+        System.out.printf(Locale.ROOT,
+                "    [full-ranking %s]  scoredBigrams=%d totalBigrams=%d  
marginGate=%.3f nats (%.3f×bg)%n",
+                layer, sr.scoredBigrams, sr.totalBigrams, marginNats,
+                
NaiveBayesBigramEncodingDetector.MARGIN_THRESHOLD_NATS_PER_BIGRAM);
+        int limit = Math.min(topN, n);
+        for (int rank = 0; rank < limit; rank++) {
+            int c = idx[rank];
+            double score = scores[c];
+            // Distance below the best score; 0 for the top row.
+            double gap = top1 - score;
+            // Report NaN rather than dividing by a zero bigram count.
+            double gapPerBg = (sr.scoredBigrams > 0) ? gap / sr.scoredBigrams 
: Double.NaN;
+            String emit = (rank == 0) ? "top1"
+                    : (gap < marginNats ? "EMIT" : "----");
+            System.out.printf(Locale.ROOT,
+                    "      #%2d  %-18s  score=%+11.3f  gap=%+8.3f  
gap/bg=%+.4f  %s%n",
+                    rank + 1, labels[c], score, gap, gapPerBg, emit);
+        }
+    }
+
+    /**
+     * Decodes a hex string into bytes.  Whitespace, commas and colons are
+     * removed first, so {@code "0a ff"}, {@code "0a:ff"} and {@code "0a,ff"}
+     * all decode to the same two bytes.
+     *
+     * @param s hex text, two digits per byte once separators are removed
+     * @return the decoded bytes; an empty array if no digits remain
+     * @throws IllegalArgumentException if the cleaned string has odd length
+     * @throws NumberFormatException if any remaining character is not a hex digit
+     */
+    private static byte[] decodeHex(String s) {
+        String cleaned = s.replaceAll("[\\s,:]", "");
+        if (cleaned.length() % 2 != 0) {
+            throw new IllegalArgumentException("hex string must have even length: " + s);
+        }
+        byte[] out = new byte[cleaned.length() / 2];
+        for (int i = 0; i < out.length; i++) {
+            // Validate each nibble with Character.digit instead of calling
+            // Integer.parseInt on the pair: parseInt accepts a leading sign,
+            // so "-1" or "+f" would be decoded instead of rejected.
+            int hi = Character.digit(cleaned.charAt(2 * i), 16);
+            int lo = Character.digit(cleaned.charAt(2 * i + 1), 16);
+            if (hi < 0 || lo < 0) {
+                throw new NumberFormatException("invalid hex pair \""
+                        + cleaned.substring(2 * i, 2 * i + 2)
+                        + "\" at cleaned offset " + (2 * i) + " in: " + s);
+            }
+            out[i] = (byte) ((hi << 4) | lo);
+        }
+        return out;
+    }
+
+    /** One unit of trace work: a display label paired with the bytes to analyse. */
+    private static final class ProbeInput {
+        /** Name shown for this probe in the trace output. */
+        final String label;
+        /** Probe content; the array is stored by reference, not copied. */
+        final byte[] bytes;
+
+        ProbeInput(String probeLabel, byte[] probeBytes) {
+            label = probeLabel;
+            bytes = probeBytes;
+        }
+    }
+
     private static String fmt(List<EncodingResult> rs) {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < rs.size(); i++) {
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
index 43eecdc5da..a082f4c5be 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
@@ -37,6 +37,8 @@ import java.util.Set;
 import java.util.TreeMap;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.tika.ml.chardetect.HtmlByteStripper;
+
 /**
  * Naive-Bayes byte-bigram charset classifier trainer.
  *
@@ -80,6 +82,10 @@ public class TrainNaiveBayesBigram {
 
     private static final double DEFAULT_ALPHA_BASE = 1.0;
     private static final int DEFAULT_MAX_SAMPLES = 50_000;
+    /** Match {@code MojibusterEncodingDetector.MIN_TAG_COUNT_TO_USE_STRIP}. */
+    private static final int MIN_TAG_COUNT_TO_USE_STRIP = 1;
+    /** Match {@code 
MojibusterEncodingDetector.MIN_ENTITY_COUNT_TO_USE_STRIP}. */
+    private static final int MIN_ENTITY_COUNT_TO_USE_STRIP = 3;
     /**
      * Default per-class vocabulary coverage.  0.999 means the top-K
      * most frequent bigrams covering 99.9% of the class's marginal
@@ -192,7 +198,7 @@ public class TrainNaiveBayesBigram {
         }
 
         System.out.printf(Locale.ROOT,
-                "coverage=%.3f  alpha-base=%.3f  max-samples/class=%,d%n",
+                "coverage=%.3f  alpha-base=%.3f  max-samples/class=%,d  
(markup-stripped via HtmlByteStripper)%n",
                 coverage, alphaBase, maxSamples);
         System.out.println("Classes (" + classFilter.size() + "): "
                 + new java.util.TreeSet<>(classFilter));
@@ -244,8 +250,10 @@ public class TrainNaiveBayesBigram {
             // across contributing files so a 2-file class (UTF-16 LE+BE)
             // doesn't overrun a 1-file class's sample count.
             int perFileBudget = Math.max(1, maxSamples / 
entry.getValue().size());
+            long samplesStripped = 0;
             for (Path f : entry.getValue()) {
                 int fileSamples = 0;
+                byte[] stripBuf = new byte[65536];
                 try (InputStream fis = new FileInputStream(f.toFile());
                      GZIPInputStream gis = new GZIPInputStream(fis);
                      DataInputStream dis = new DataInputStream(gis)) {
@@ -258,8 +266,30 @@ public class TrainNaiveBayesBigram {
                         }
                         byte[] sample = new byte[len];
                         dis.readFully(sample);
-                        for (int i = 0; i + 1 < sample.length; i++) {
-                            int bigram = ((sample[i] & 0xFF) << 8) | (sample[i 
+ 1] & 0xFF);
+                        // Train/inference symmetry: same HtmlByteStripper +
+                        // same gate as MojibusterEncodingDetector at
+                        // detect-time.  Hardcoded ON to prevent accidental
+                        // misconfiguration; the bigram tables MUST reflect
+                        // the byte distribution NB will actually score.
+                        if (stripBuf.length < len) {
+                            stripBuf = new byte[len];
+                        }
+                        HtmlByteStripper.Result r = HtmlByteStripper.strip(
+                                sample, 0, len, stripBuf, 0);
+                        byte[] scoreBytes;
+                        int scoreLen;
+                        if (r.tagCount >= MIN_TAG_COUNT_TO_USE_STRIP
+                                || r.entityCount >= 
MIN_ENTITY_COUNT_TO_USE_STRIP) {
+                            scoreBytes = stripBuf;
+                            scoreLen = r.length;
+                            samplesStripped++;
+                        } else {
+                            scoreBytes = sample;
+                            scoreLen = len;
+                        }
+                        for (int i = 0; i + 1 < scoreLen; i++) {
+                            int bigram = ((scoreBytes[i] & 0xFF) << 8)
+                                    | (scoreBytes[i + 1] & 0xFF);
                             counts[bigram]++;
                             totalBigrams++;
                         }
@@ -271,10 +301,11 @@ public class TrainNaiveBayesBigram {
             countsPerClass[ci] = counts;
             totalsPerClass[ci] = totalBigrams;
             System.out.printf(Locale.ROOT,
-                    "  counted %-20s  %,7d samples  %,10d total bigrams  (%d 
file%s)%n",
+                    "  counted %-20s  %,7d samples  %,10d total bigrams  (%d 
file%s)  stripped=%,d%n",
                     labels[ci], numSamples, totalBigrams,
                     entry.getValue().size(),
-                    entry.getValue().size() == 1 ? "" : "s");
+                    entry.getValue().size() == 1 ? "" : "s",
+                    samplesStripped);
             ci++;
         }

(tika) 01/09: TIKA-4731 - strip html entities, rebalance training, add modest clipping

Reply via email to