(tika) 05/06: through step 6

tallison Fri, 17 Apr 2026 10:59:40 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch charset-ship-today
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 5b2e867ef59073893a7f9422f229d04d8a92ea8d
Author: tallison <[email protected]>
AuthorDate: Fri Apr 17 09:35:21 2026 -0400

    through step 6
---
 .../tika/langdetect/charsoup/HtmlStripper.java     |  82 +++++++++++-----
 .../tika/langdetect/charsoup/HtmlStripperTest.java |  62 ++++++++----
 .../ml/chardetect/MojibusterEncodingDetector.java  |  32 +++++-
 .../ml/chardetect/tools/TrainCharsetModel.java     | 108 ++++++++++++++-------
 4 files changed, 207 insertions(+), 77 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java
index f36aa635c1..fd6ef5f78a 100644
--- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java
+++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/HtmlStripper.java
@@ -17,16 +17,16 @@
 package org.apache.tika.langdetect.charsoup;
 
 /**
- * HTML/XML markup stripping tuned for language scoring.  Not a full HTML
- * parser — purpose-built to feed character-bigram language detectors a
- * markup-free string that still carries the page's content language.
+ * HTML/XML markup stripping tuned for language scoring and charset
+ * disambiguation.  Not a full HTML parser — purpose-built to feed
+ * character-bigram language detectors a markup-free string that still
+ * carries the page's content language.
  *
  * <p>Real-world HTML probes are routinely 95-99% markup by byte count.
  * Without this pass, a language detector sees the markup as its primary
  * input — which on any HTML page looks predominantly like ASCII English
- * regardless of the page's actual content language.  Stripping markup
- * (and decoding numeric entities, which can carry content) lets the
- * detector see the actual content.
+ * regardless of the page's actual content language.  Stripping the markup
+ * lets the detector see the actual content.
  *
  * <h3>What it does, in one linear pass</h3>
  * <ul>
@@ -37,15 +37,23 @@ package org.apache.tika.langdetect.charsoup;
  *   <li>Removes {@code <!-- ... -->} comments.</li>
  *   <li>Removes {@code <...>} tag markup (element names, attribute names,
  *       attribute values).</li>
- *   <li><em>Decodes</em> numeric character references ({@code &#1234;},
- *       {@code &#xABCD;}) to their actual code points — these can carry
- *       the page's primary content (e.g. Korean-charset pages that emit
- *       simplified-Chinese-only ideographs via numeric entities for
- *       cross-charset compatibility).</li>
  *   <li>Replaces named entity references ({@code &amp;}, {@code &nbsp;},
  *       {@code &copy;}) with a space — these are nearly always
  *       punctuation/typography with low language signal, and a full
  *       named-entity table would be heavyweight.</li>
+ *   <li>Default ({@link #strip(String)}): <strong>drops numeric character
+ *       references</strong> ({@code &#1234;}, {@code &#xABCD;}) to a single
+ *       space, on the grounds that a single numeric-entity-heavy section
+ *       can expand to a very different byte distribution than the raw
+ *       probe we are trying to characterise — at charset-detection time
+ *       we want to score the raw bytes, not a synthetic Unicode rendering
+ *       of them.</li>
+ *   <li>Opt-in ({@link #stripAndDecodeNumeric(String)}): <em>decodes</em>
+ *       numeric character references to their actual code points.  Useful
+ *       where numeric entities carry the page's primary content (e.g.
+ *       pages that emit CJK ideographs via {@code &#NNNN;} for
+ *       cross-charset compatibility, so the decoded content reaches a
+ *       downstream language scorer).</li>
  * </ul>
  *
  * <h3>What it doesn't do</h3>
@@ -68,14 +76,34 @@ public final class HtmlStripper {
     }
 
     /**
-     * Strip markup from {@code text} and return the content with numeric
-     * entities decoded.  See class javadoc for details.
+     * Strip markup from {@code text}.  Numeric character references are
+     * dropped to a space — same treatment as named entities.  See class
+     * javadoc for details.  Use {@link #stripAndDecodeNumeric(String)} when
+     * a caller specifically needs numeric entities decoded.
      *
      * @param text input string (HTML/XML or plain text); {@code null} or empty
      *             returns the input unchanged
-     * @return content with markup removed and numeric entities decoded
+     * @return content with markup removed and entity references dropped to space
      */
     public static String strip(String text) {
+        return strip(text, false);
+    }
+
+    /**
+     * Strip markup from {@code text}, decoding numeric character references
+     * to their actual code points.  Use when numeric entities carry content
+     * the downstream consumer needs to see (e.g. language scoring on pages
+     * that emit CJK ideographs as {@code &#NNNN;}).
+     *
+     * @param text input string (HTML/XML or plain text); {@code null} or empty
+     *             returns the input unchanged
+     * @return content with markup removed and numeric entities decoded
+     */
+    public static String stripAndDecodeNumeric(String text) {
+        return strip(text, true);
+    }
+
+    private static String strip(String text, boolean decodeNumericEntities) {
         if (text == null || text.isEmpty()) {
             return text;
         }
@@ -87,7 +115,7 @@ public final class HtmlStripper {
             if (c == '<') {
                 i = handleOpenAngle(text, i, n, out);
             } else if (c == '&') {
-                i = handleAmpersand(text, i, n, out);
+                i = handleAmpersand(text, i, n, out, decodeNumericEntities);
             } else {
                 out.append(c);
                 i++;
@@ -114,8 +142,15 @@ public final class HtmlStripper {
         return end < 0 ? n : end + 1;
     }
 
-    /** Handle a {@code &} — numeric entity (decode), named entity (drop), or literal. */
-    private static int handleAmpersand(String s, int i, int n, StringBuilder out) {
+    /**
+     * Handle a {@code &} — numeric entity, named entity, or literal.  When
+     * {@code decodeNumericEntities} is {@code true}, valid numeric entities
+     * are decoded to their Unicode code point; otherwise they are dropped
+     * to a space, same as named entities.  An unparseable numeric entity is
+     * always dropped to space (it's not literal text even in no-decode mode).
+     */
+    private static int handleAmpersand(String s, int i, int n, StringBuilder out,
+                                       boolean decodeNumericEntities) {
         // Look for ; within a small window — entity references are short.
         int max = Math.min(n, i + 12);
         int semi = -1;
@@ -135,12 +170,15 @@ public final class HtmlStripper {
         }
         // Numeric entity?
         if (semi >= i + 3 && s.charAt(i + 1) == '#') {
-            int cp = parseNumericEntity(s, i + 2, semi);
-            if (cp >= 0) {
-                appendCodePointSafe(out, cp);
-                return semi + 1;
+            if (decodeNumericEntities) {
+                int cp = parseNumericEntity(s, i + 2, semi);
+                if (cp >= 0) {
+                    appendCodePointSafe(out, cp);
+                    return semi + 1;
+                }
             }
-            // Unparseable numeric entity — treat as space (it's not literal text).
+            // Default (no-decode) path, or unparseable numeric in decode mode:
+            // drop to a space — numeric entities are not literal text.
             out.append(' ');
             return semi + 1;
         }
diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java
index b21b2c8a40..8c5f816042 100644
--- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java
+++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/HtmlStripperTest.java
@@ -63,30 +63,41 @@ public class HtmlStripperTest {
     }
 
     @Test
-    public void handlesEntities() {
-        // Named entities (e.g. &amp;, &nbsp;) → stripped to space (low signal,
-        // and a full named-entity table is heavyweight).
-        // Numeric entities (e.g. &#1234;, &#x201D;) → DECODED to their actual
-        // code point so the content reaches the language detector.  This
-        // matters for files where the page's primary content is delivered
-        // via numeric entities (e.g. industrial-product pages emitting CJK
-        // ideographs as &#NNNN; for cross-charset compatibility).
+    public void handlesEntitiesDefault() {
+        // Default strip(): both named and numeric entities are dropped to a
+        // space.  Numeric decode is opt-in via stripAndDecodeNumeric(); the
+        // default target is charset detection on raw bytes, where a big
+        // numeric-entity expansion would distort what we're measuring.
         String stripped = HtmlStripper.strip(
                 "<p>&amp;hello&nbsp;world&#8211;test&#x201D;end</p>");
         assertFalse(stripped.contains("&"),
                 "No entity references should survive: " + stripped);
-        // 0x2013 = en-dash, 0x201D = right double quote — should appear as
-        // actual chars, not as entity references nor as spaces.
+        assertFalse(stripped.contains("\u2013"),
+                "Default strip must NOT decode numeric entities: " + stripped);
+        assertFalse(stripped.contains("\u201D"),
+                "Default strip must NOT decode numeric entities: " + stripped);
+        assertTrue(stripped.contains("hello"));
+        assertTrue(stripped.contains("world"));
+    }
+
+    @Test
+    public void decodeVariantDecodesEntities() {
+        // stripAndDecodeNumeric() preserves the legacy behaviour: named
+        // entities → space, numeric entities → actual code point.  Kept for
+        // callers that need the content behind numeric entities (e.g.
+        // language scoring on pages that emit CJK ideographs as &#NNNN;).
+        String stripped = HtmlStripper.stripAndDecodeNumeric(
+                "<p>&amp;hello&nbsp;world&#8211;test&#x201D;end</p>");
+        assertFalse(stripped.contains("&"),
+                "No entity references should survive: " + stripped);
         assertTrue(stripped.contains("\u2013"),
                 "Numeric entity &#8211; should decode to en-dash: " + stripped);
         assertTrue(stripped.contains("\u201D"),
                 "Numeric entity &#x201D; should decode to right double quote: " + stripped);
-        assertTrue(stripped.contains("hello"));
-        assertTrue(stripped.contains("world"));
     }
 
     @Test
-    public void decodesCjkNumericEntities() {
+    public void decodeVariantDecodesCjkNumericEntities() {
         // Real-world case: industrial-product pages that emit CJK ideographs
         // via numeric entities (so they render correctly regardless of the
         // page's declared charset).  The decoded content must reach the
@@ -94,7 +105,7 @@ public class HtmlStripperTest {
         // ASCII markup and concludes "English" no matter what the page is
         // actually about.
         String input = "<p>&#36807;&#28388;&#31163; cyclone</p>";
-        String stripped = HtmlStripper.strip(input);
+        String stripped = HtmlStripper.stripAndDecodeNumeric(input);
         assertTrue(stripped.contains("\u8FC7"),
                 "0x8FC7 (过) should decode: " + stripped);
         assertTrue(stripped.contains("\u6EE4"),
@@ -104,11 +115,28 @@ public class HtmlStripperTest {
     }
 
     @Test
-    public void rejectsInvalidNumericEntities() {
+    public void defaultDropsCjkNumericEntitiesToSpaces() {
+        // The inverse of decodeVariantDecodesCjkNumericEntities: default
+        // strip() drops all numeric entities.  This is what we want for
+        // raw-byte charset-detection scoring — the CJK ideographs are not
+        // part of the probe we are characterising.
+        String input = "<p>&#36807;&#28388;&#31163; cyclone</p>";
+        String stripped = HtmlStripper.strip(input);
+        assertFalse(stripped.contains("\u8FC7"), "default must not decode: " + stripped);
+        assertFalse(stripped.contains("\u6EE4"), "default must not decode: " + stripped);
+        assertFalse(stripped.contains("\u79BB"), "default must not decode: " + stripped);
+        assertTrue(stripped.contains("cyclone"));
+    }
+
+    @Test
+    public void rejectsInvalidNumericEntitiesInDecodeVariant() {
         // Surrogate-half codepoints, control chars, and out-of-range numbers
         // should be replaced with a space rather than emitted (they would
-        // either crash the language detector or skew scores).
-        String stripped = HtmlStripper.strip("good&#xD800;bad&#0;bad&#9999999;good");
+        // either crash the language detector or skew scores).  Applies to
+        // the decode-numeric variant; the default already drops everything
+        // numeric to a space regardless of validity.
+        String stripped = HtmlStripper.stripAndDecodeNumeric(
+                "good&#xD800;bad&#0;bad&#9999999;good");
         assertFalse(stripped.contains("\uD800"),
                 "Surrogate code point should not be emitted: " + stripped);
         assertTrue(stripped.contains("good"));
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index c650284f53..23181ddb7e 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -108,15 +108,28 @@ public class MojibusterEncodingDetector implements EncodingDetector {
          */
         CRLF_TO_WINDOWS,
         /**
-         * When the top candidate is a single-byte Latin-family charset
-         * (see {@link CharsetConfusables#SBCS_LATIN_FAMILY}) other than
+         * On <strong>short probes only</strong>, when the top candidate is a
+         * single-byte Latin-family charset (see
+         * {@link CharsetConfusables#SBCS_LATIN_FAMILY}) other than
          * windows-1252, and the probe decodes byte-identically under
          * windows-1252, swap the result to windows-1252 as the unmarked
-         * Latin default.  Cheap per-probe byte walk via
+         * Latin default.
+         *
+         * <p>Short-probe gate: the rule only fires when
+         * {@code probe.length < SHORT_PROBE_THRESHOLD} (currently 50 bytes).
+         * On longer probes the model has seen enough high-byte evidence to
+         * discriminate sibling Latin code pages (windows-1250/1254/1257,
+         * ISO-8859-X) genuinely — rewriting to windows-1252 there would
+         * erase real distinctions.  On short probes the model is falling
+         * back to bias, which is where sparse-Latin vCard-style content
+         * false-positives as IBM424 / windows-1257 / x-MacRoman; this gate
+         * catches those.</p>
+         *
+         * <p>Per-probe byte walk via
          * {@link DecodeEquivalence#byteIdenticalOnProbe}; short-circuits on
          * the first disagreeing high byte.  Zero cost for probes whose top
          * candidate isn't Latin-family (CJK, UTF-*, EBCDIC, Cyrillic,
-         * Arabic, Greek, Hebrew).
+         * Arabic, Greek, Hebrew).</p>
          */
         LATIN_FALLBACK_WIN1252
     }
@@ -536,7 +549,16 @@ public class MojibusterEncodingDetector implements EncodingDetector {
             results = selectAtLeast(model, logits, MIN_CANDIDATES, probe, grammar);
         }
 
-        if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252)) {
+        // LATIN_FALLBACK_WIN1252 is gated to short probes only.  On long probes
+        // the model has enough high-byte evidence to discriminate sibling Latin
+        // code pages (windows-1250/1254/1257/ISO-8859-X) and we trust it;
+        // forcing a rewrite to windows-1252 would erase those distinctions.
+        // Short probes (< SHORT_PROBE_THRESHOLD bytes) are where the model
+        // falls back to bias — that's where the fallback prevents
+        // IBM424/windows-1257/x-MacRoman false positives on sparse-Latin
+        // vCard-style content.
+        if (enabledRules.contains(Rule.LATIN_FALLBACK_WIN1252)
+                && probe.length < SHORT_PROBE_THRESHOLD) {
             results = applyLatinFallback(probe, results);
         }
 
diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
index d7379f4c8b..b46e89ea6c 100644
--- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
+++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
@@ -70,36 +70,60 @@ public class TrainCharsetModel {
     private static final int DEFAULT_MAX_SAMPLES = 500_000;
 
     /**
-     * Labels excluded from the main SBCS "kitchen-sink" model by default.
+     * Labels the main SBCS "kitchen-sink" model is trained on today.
      *
-     * <p>Hardcoded here (rather than passed on the command line) so the model's
-     * class set is versioned in git alongside the code that uses it — past
-     * retraining runs with inconsistent CLI flags were a recurring source of
-     * mismatched inference/training feature sets.</p>
+     * <p>Include-list semantics (not exclude): {@link BuildCharsetTrainingData}
+     * generates training corpora for many more labels than these (EBCDIC
+     * nationals, DOS OEM, Mac charsets, extended ISO-8859 variants, etc.),
+     * pre-positioned for future specialists; today's SBCS consumes only the
+     * explicit set below.  Hardcoded here so the model's class set is
+     * versioned in git alongside the code that uses it — past retraining
+     * runs with inconsistent CLI flags were a recurring source of mismatched
+     * inference/training feature sets.</p>
      *
-     * <p>{@link BuildCharsetTrainingData} still generates training corpora for
-     * these labels — they are needed by future specialists (e.g. an EBCDIC
-     * specialist) — but the main SBCS model doesn't consume them today:</p>
+     * <p>Baseline is the v6 label set ({@code chardetect-v6-no-utf32.bin},
+     * 35 classes), with these changes:</p>
      * <ul>
-     *   <li><b>IBM424-ltr/rtl</b> (Hebrew EBCDIC) — content bytes occupy 0x41–0x6A,
-     *       entirely below the 0x80 threshold the shipped
+     *   <li><b>Removed</b> {@code IBM424-ltr/rtl}, {@code IBM420-ltr/rtl}
+     *       (Hebrew/Arabic EBCDIC) — content bytes occupy {@code 0x41–0x6A},
+     *       entirely below the {@code 0x80} threshold the shipped
      *       {@link ByteNgramFeatureExtractor} considers.  Training on these
-     *       labels teaches weights the inference path cannot ever match.</li>
-     *   <li><b>IBM420-ltr/rtl</b> (Arabic EBCDIC) — same reason as IBM424.</li>
-     *   <li><b>IBM1047</b> (z/OS Unix System Services Latin-1) — byte-identical
-     *       to IBM500 on most prose; having both as classes just splits the
-     *       EBCDIC-Latin signal without adding discrimination the model can
-     *       use.</li>
+     *       labels teaches weights the inference path cannot match.</li>
+     *   <li><b>Removed</b> {@code IBM1047} — byte-identical to {@code IBM500}
+     *       on most prose; having both as classes splits the EBCDIC-Latin
+     *       signal without adding discrimination.</li>
+     *   <li><b>Removed</b> {@code UTF-16-LE} / {@code UTF-16-BE} — owned by
+     *       {@code Utf16SpecialistEncodingDetector}; no longer emitted as
+     *       main-model classes (same reasoning the v6 name
+     *       "{@code -no-utf32}" captures for UTF-32).</li>
+     *   <li><b>Added</b> {@code x-windows-949} — Korean MS949, strict
+     *       superset of EUC-KR; trained as a separate class so the model
+     *       can discriminate MS949-extension-byte content from pure
+     *       EUC-KR.</li>
      * </ul>
-     *
-     * <p>CLI {@code --exclude} is unioned with this set, not replaced, so an
-     * operator can add further exclusions but cannot accidentally suppress
-     * the hardcoded policy.</p>
      */
-    static final Set<String> TODAY_SBCS_EXCLUDE = Set.of(
-            "IBM424-ltr", "IBM424-rtl",
-            "IBM420-ltr", "IBM420-rtl",
-            "IBM1047");
+    static final Set<String> TODAY_SBCS_INCLUDE = Set.of(
+            // CJK (multi-byte)
+            "Big5-HKSCS", "EUC-JP", "EUC-KR", "x-windows-949",
+            "GB18030", "Shift_JIS", "x-EUC-TW",
+            // Unicode
+            "UTF-8",
+            // EBCDIC (international Latin only — other variants deferred to specialist)
+            "IBM500",
+            // DOS / OEM Latin (retained from v6)
+            "IBM850", "IBM852",
+            // Cyrillic
+            "IBM855", "IBM866", "KOI8-R", "KOI8-U",
+            "windows-1251", "x-mac-cyrillic",
+            // Windows single-byte
+            "windows-1250", "windows-1252", "windows-1253", "windows-1254",
+            "windows-1255", "windows-1256", "windows-1257", "windows-1258",
+            "windows-874",
+            // ISO-8859 (only the ones v6 kept as distinct labels; 1/2/4/9 fold
+            // into their windows-12XX supersets)
+            "ISO-8859-3", "ISO-8859-16",
+            // Mac
+            "x-MacRoman");
 
     public static void main(String[] args) throws IOException {
         Path dataDir = null;
@@ -111,9 +135,10 @@ public class TrainCharsetModel {
         // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into
         // one target label at training time (e.g. merge script variants into one class).
         Map<String, String> labelRemap = new HashMap<>();
-        // Start from the hardcoded SBCS-kitchen-sink exclusion list; CLI
-        // --exclude adds to it but cannot override.
-        Set<String> excludeLabels = new java.util.HashSet<>(TODAY_SBCS_EXCLUDE);
+        // CLI --exclude adds extra labels to drop *on top of* the include-list
+        // policy (used for ablation experiments).  Cannot override the include
+        // list — labels not in the policy are excluded regardless.
+        Set<String> excludeLabels = new java.util.HashSet<>();
 
         for (int i = 0; i < args.length; i++) {
             switch (args[i]) {
@@ -164,27 +189,44 @@ public class TrainCharsetModel {
             System.err.println("  --max-samples-per-class N");
             System.err.println("  --label-remap src1:dst1,src2:dst2");
             System.err.println("                           merge source labels into a single target label");
-            System.err.println("  --exclude cs1,cs2          ADD these to the hardcoded exclusion list "
-                    + TODAY_SBCS_EXCLUDE);
+            System.err.println("  --exclude cs1,cs2          drop these additionally on top of the hardcoded "
+                    + "include list (" + TODAY_SBCS_INCLUDE.size() + " classes in TODAY_SBCS_INCLUDE)");
             System.exit(1);
         }
 
-        // Discover charset files
+        // Discover charset files.  Include-list policy: only labels in
+        // TODAY_SBCS_INCLUDE are admitted, regardless of what files exist in
+        // dataDir (which may contain future-specialist corpora — Mac, DOS
+        // OEM, EBCDIC nationals, etc.).  CLI --exclude can drop further
+        // labels for ablation.
         List<Path> charsetFiles = Files.list(dataDir)
                 .filter(p -> p.getFileName().toString().endsWith(".bin.gz"))
                 .filter(p -> {
                     String cs = p.getFileName().toString().replaceAll("\\.bin\\.gz$", "");
-                    return !excludeLabels.contains(cs);
+                    return TODAY_SBCS_INCLUDE.contains(cs) && !excludeLabels.contains(cs);
                 })
                 .sorted()
                 .collect(Collectors.toList());
 
+        System.out.println("TODAY_SBCS_INCLUDE (" + TODAY_SBCS_INCLUDE.size() + " classes): "
+                + new java.util.TreeSet<>(TODAY_SBCS_INCLUDE));
         if (!excludeLabels.isEmpty()) {
-            System.out.println("Excluded labels: " + excludeLabels);
+            System.out.println("Additional CLI --exclude: " + excludeLabels);
+        }
+        // Report any include-list classes that had no matching file on disk.
+        java.util.Set<String> foundLabels = charsetFiles.stream()
+                .map(p -> p.getFileName().toString().replaceAll("\\.bin\\.gz$", ""))
+                .collect(Collectors.toCollection(java.util.TreeSet::new));
+        java.util.Set<String> missing = new java.util.TreeSet<>(TODAY_SBCS_INCLUDE);
+        missing.removeAll(foundLabels);
+        missing.removeAll(excludeLabels);
+        if (!missing.isEmpty()) {
+            System.err.println("WARNING: include-list classes with no data file in "
+                    + dataDir + ": " + missing);
         }
 
         if (charsetFiles.isEmpty()) {
-            System.err.println("No .bin.gz files found in: " + dataDir);
+            System.err.println("No matching .bin.gz files found in: " + dataDir);
             System.exit(1);
         }

(tika) 05/06: through step 6

Reply via email to