This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch rm-best-match-from-glm in repository https://gitbox.apache.org/repos/asf/tika.git
commit 40297eed20ba14c707a6ef87454678fe9eb543d0 Author: tallison <[email protected]> AuthorDate: Wed Apr 22 14:58:09 2026 -0400 remove bestMatch --- .../charsoup/CharSoupEncodingDetector.java | 48 +++++++++++++++++----- .../charsoup/GenerativeLanguageModel.java | 48 ++++++++-------------- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java index e9fc080c29..fdaa52d889 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -290,13 +290,32 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { } /** - * Generative-model tiebreaker: for each candidate charset's decoded text, - * detect the most likely language then compute its z-score. The charset - * producing the highest z-score (closest to "real language") wins, provided - * it exceeds {@link #MIN_GENERATIVE_ZSCORE}. + * Generative-model tiebreaker: for each candidate charset's decoded + * text, let the discriminative language classifier pick the most + * likely language, then ask the generative model how natural the + * decoded text is UNDER THAT LANGUAGE. The charset producing the + * highest length-adjusted z-score wins, provided it exceeds + * {@link #MIN_GENERATIVE_ZSCORE}. * - * @return the winning charset, or {@code null} if no candidate passes the - * threshold or all candidates decode to identical text + * <p>The chaining matters. An earlier revision used + * {@code GLM.bestMatch(text)} to pick the language, which compares + * raw scores across all ~200 languages. 
That comparison is + * unreliable because out-of-class inputs can produce hash-collision + * scores that exceed in-class scores for another language — the + * classic pathology where real Chinese prose ranks at position 20 + * under zho while Sakizaya, Amis, Min-Dong-romanization, and other + * unrelated languages rank ahead of it. The GLM's raw scores were + * never meant for cross-language comparison. The discriminative + * classifier was trained explicitly for that job and is far more + * reliable at picking the language. Once a language is picked, the + * GLM's per-language calibrated z-score answers the question it + * was actually designed for: "is this decoded text natural text in + * language X?" + * + * @return the winning charset, or {@code null} if no candidate + * passes the z-score threshold, no candidate yields a + * discriminative language prediction, or all candidates + * decode to identical text */ private static <K> K generativeTiebreak(Map<K, String> candidates) { if (candidates.isEmpty()) { @@ -323,13 +342,20 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { if (CharSoupLanguageDetector.junkRatio(text) > 0.10f) { continue; } - Map.Entry<String, Float> match = GLM.bestMatch(text); - if (match == null) { + // Chain: discriminative classifier picks the language, + // GLM scores under that language (not bestMatch across all langs). 
+ List<String> topLangs = + CharSoupLanguageDetector.topShortTextLanguages(text, 1); + if (topLangs.isEmpty()) { + continue; + } + String discLang = topLangs.get(0); + if (discLang == null || discLang.isEmpty()) { continue; } - float z = GLM.zScoreLengthAdjusted(text, match.getKey()); - LOG.debug("generativeTiebreak: {} -> lang={} z={}", - entry.getKey(), match.getKey(), z); + float z = GLM.zScoreLengthAdjusted(text, discLang); + LOG.debug("generativeTiebreak: {} -> discLang={} z={}", + entry.getKey(), discLang, z); if (!Float.isNaN(z) && z > bestZ) { bestZ = z; bestKey = entry.getKey(); diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java index 8a2c75d54f..b6810e2749 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java @@ -310,38 +310,22 @@ public class GenerativeLanguageModel { } } - /** - * Score {@code text} against all languages and return the best match. - */ - public Map.Entry<String, Float> bestMatch(String text) { - String best = null; - float bestScore = Float.NEGATIVE_INFINITY; - for (String lang : langIds) { - float s = score(text, lang); - if (!Float.isNaN(s) && s > bestScore) { - bestScore = s; - best = lang; - } - } - return best == null ? null : Map.entry(best, bestScore); - } - - /** - * Average raw score of {@code text} across all CJK languages in the model. - */ - public float avgCjkScore(String text) { - double sum = 0; - int count = 0; - for (int i = 0; i < langIds.size(); i++) { - if (!isCjk[i]) continue; - float s = score(text, langIds.get(i)); - if (!Float.isNaN(s)) { - sum += s; - count++; - } - } - return count == 0 ? 
Float.NaN : (float) (sum / count); - } + // Cross-language score comparison was removed (formerly bestMatch / + // avgCjkScore). The per-class raw scores are not comparable across + // languages: each language's per-bucket log-probabilities are + // normalised by that language's own training-corpus total, so + // small-corpus languages produce systematically higher per-bucket + // log-probs than large-corpus languages on out-of-class input. + // In practice a real Chinese probe scored as {@code zho} would return + // a raw score around -8 while scoring the same probe against a small + // minor-language model would return a raw score around -3 to -5 just + // from the smoothing-denominator artefact. The GLM is designed to + // answer "given it is language X, how natural is this text as + // language X?" — not "which language is this?". Callers that need + // to pick a language first must do so with the discriminative + // classifier (CharSoupLanguageDetector), then pass that language + // explicitly to {@link #score}, {@link #zScore}, or + // {@link #zScoreLengthAdjusted}. // ---- Z-score API ----
