(tika) 01/01: remove bestMatch

tallison Wed, 22 Apr 2026 11:58:32 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch rm-best-match-from-glm
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 40297eed20ba14c707a6ef87454678fe9eb543d0
Author: tallison <[email protected]>
AuthorDate: Wed Apr 22 14:58:09 2026 -0400

    remove bestMatch
---
 .../charsoup/CharSoupEncodingDetector.java         | 48 +++++++++++++++++-----
 .../charsoup/GenerativeLanguageModel.java          | 48 ++++++++--------------
 2 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
index e9fc080c29..fdaa52d889 100644
--- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -290,13 +290,32 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector {
     }
 
     /**
-     * Generative-model tiebreaker: for each candidate charset's decoded text,
-     * detect the most likely language then compute its z-score. The charset
-     * producing the highest z-score (closest to "real language") wins, provided
-     * it exceeds {@link #MIN_GENERATIVE_ZSCORE}.
+     * Generative-model tiebreaker: for each candidate charset's decoded
+     * text, let the discriminative language classifier pick the most
+     * likely language, then ask the generative model how natural the
+     * decoded text is UNDER THAT LANGUAGE. The charset producing the
+     * highest length-adjusted z-score wins, provided it exceeds
+     * {@link #MIN_GENERATIVE_ZSCORE}.
      *
-     * @return the winning charset, or {@code null} if no candidate passes the
-     *         threshold or all candidates decode to identical text
+     * <p>The chaining matters. An earlier revision used
+     * {@code GLM.bestMatch(text)} to pick the language, which compares
+     * raw scores across all ~200 languages. That comparison is
+     * unreliable because out-of-class inputs can produce hash-collision
+     * scores that exceed in-class scores for another language — the
+     * classic pathology where real Chinese prose ranks at position 20
+     * under zho while Sakizaya, Amis, Min-Dong-romanization, and other
+     * unrelated languages rank ahead of it. The GLM's raw scores were
+     * never meant for across-language comparison. The discriminative
+     * classifier was trained explicitly for that job and is far more
+     * reliable at picking the language. Once a language is picked, the
+     * GLM's per-language calibrated z-score answers the question it
+     * actually was designed for: "is this decoded text natural text in
+     * language X?"
+     *
+     * @return the winning charset, or {@code null} if no candidate
+     *         passes the z-score threshold, no candidate yields a
+     *         discriminative language prediction, or all candidates
+     *         decode to identical text
      */
     private static <K> K generativeTiebreak(Map<K, String> candidates) {
         if (candidates.isEmpty()) {
@@ -323,13 +342,20 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector {
             if (CharSoupLanguageDetector.junkRatio(text) > 0.10f) {
                 continue;
             }
-            Map.Entry<String, Float> match = GLM.bestMatch(text);
-            if (match == null) {
+            // Chain: discriminative classifier picks the language,
+            // GLM scores under that language (not bestMatch across all langs).
+            List<String> topLangs =
+                    CharSoupLanguageDetector.topShortTextLanguages(text, 1);
+            if (topLangs.isEmpty()) {
+                continue;
+            }
+            String discLang = topLangs.get(0);
+            if (discLang == null || discLang.isEmpty()) {
                 continue;
             }
-            float z = GLM.zScoreLengthAdjusted(text, match.getKey());
-            LOG.debug("generativeTiebreak: {} -> lang={} z={}",
-                    entry.getKey(), match.getKey(), z);
+            float z = GLM.zScoreLengthAdjusted(text, discLang);
+            LOG.debug("generativeTiebreak: {} -> discLang={} z={}",
+                    entry.getKey(), discLang, z);
             if (!Float.isNaN(z) && z > bestZ) {
                 bestZ = z;
                 bestKey = entry.getKey();
diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
index 8a2c75d54f..b6810e2749 100644
--- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
+++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
@@ -310,38 +310,22 @@ public class GenerativeLanguageModel {
         }
     }
 
-    /**
-     * Score {@code text} against all languages and return the best match.
-     */
-    public Map.Entry<String, Float> bestMatch(String text) {
-        String best = null;
-        float  bestScore = Float.NEGATIVE_INFINITY;
-        for (String lang : langIds) {
-            float s = score(text, lang);
-            if (!Float.isNaN(s) && s > bestScore) {
-                bestScore = s;
-                best = lang;
-            }
-        }
-        return best == null ? null : Map.entry(best, bestScore);
-    }
-
-    /**
-     * Average raw score of {@code text} across all CJK languages in the model.
-     */
-    public float avgCjkScore(String text) {
-        double sum = 0;
-        int count = 0;
-        for (int i = 0; i < langIds.size(); i++) {
-            if (!isCjk[i]) continue;
-            float s = score(text, langIds.get(i));
-            if (!Float.isNaN(s)) {
-                sum += s;
-                count++;
-            }
-        }
-        return count == 0 ? Float.NaN : (float) (sum / count);
-    }
+    // Cross-language score comparison was removed (formerly bestMatch /
+    // avgCjkScore).  The per-class raw scores are not comparable across
+    // languages: each language's per-bucket log-probabilities are
+    // normalised by that language's own training-corpus total, so
+    // small-corpus languages produce systematically higher per-bucket
+    // log-probs than large-corpus languages on out-of-class input.
+    // In practice a real Chinese probe scored as "zho" would return
+    // a raw score around -8 while scoring the same probe against a small
+    // minor-language model would return a raw score around -3 to -5 just
+    // from the smoothing-denominator artefact.  The GLM is designed to
+    // answer "given it is language X, how natural is this text as
+    // language X?" — not "which language is this?".  Callers that need
+    // to pick a language first must do so with the discriminative
+    // classifier (CharSoupLanguageDetector), then pass that language
+    // explicitly to score(), zScore(), or
+    // zScoreLengthAdjusted().
 
     // ---- Z-score API ----

(tika) 01/01: remove bestMatch

Reply via email to