This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e0d4a6d7f3 remove bestMatch (#2785)
e0d4a6d7f3 is described below
commit e0d4a6d7f34e6b58894ee3cc2ea49dc090255f4c
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 23 21:17:05 2026 -0400
remove bestMatch (#2785)
---
.../charsoup/CharSoupEncodingDetector.java | 48 +++++++++++++++++-----
.../charsoup/GenerativeLanguageModel.java | 48 ++++++++--------------
2 files changed, 53 insertions(+), 43 deletions(-)
diff --git
a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
index e9fc080c29..fdaa52d889 100644
---
a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
@@ -290,13 +290,32 @@ public class CharSoupEncodingDetector implements
MetaEncodingDetector {
}
/**
- * Generative-model tiebreaker: for each candidate charset's decoded text,
- * detect the most likely language then compute its z-score. The charset
- * producing the highest z-score (closest to "real language") wins, provided
- * it exceeds {@link #MIN_GENERATIVE_ZSCORE}.
+ * Generative-model tiebreaker: for each candidate charset's decoded
+ * text, let the discriminative language classifier pick the most
+ * likely language, then ask the generative model how natural the
+ * decoded text is UNDER THAT LANGUAGE. The charset producing the
+ * highest length-adjusted z-score wins, provided it exceeds
+ * {@link #MIN_GENERATIVE_ZSCORE}.
*
- * @return the winning charset, or {@code null} if no candidate passes the
- * threshold or all candidates decode to identical text
+ * <p>The chaining matters. An earlier revision used
+ * {@code GLM.bestMatch(text)} to pick the language, which compares
+ * raw scores across all ~200 languages. That comparison is
+ * unreliable because out-of-class inputs can produce hash-collision
+ * scores that exceed in-class scores for another language — the
+ * classic pathology where real Chinese prose ranks at position 20
+ * under zho while Sakizaya, Amis, Min-Dong-romanization, and other
+ * unrelated languages rank ahead of it. The GLM's raw scores were
+ * never meant for across-language comparison. The discriminative
+ * classifier was trained explicitly for that job and is far more
+ * reliable at picking the language. Once a language is picked, the
+ * GLM's per-language calibrated z-score answers the question it
+ * was actually designed for: "is this decoded text natural text in
+ * language X?"
+ *
+ * @return the winning charset, or {@code null} if no candidate
+ * passes the z-score threshold, no candidate yields a
+ * discriminative language prediction, or all candidates
+ * decode to identical text
*/
private static <K> K generativeTiebreak(Map<K, String> candidates) {
if (candidates.isEmpty()) {
@@ -323,13 +342,20 @@ public class CharSoupEncodingDetector implements
MetaEncodingDetector {
if (CharSoupLanguageDetector.junkRatio(text) > 0.10f) {
continue;
}
- Map.Entry<String, Float> match = GLM.bestMatch(text);
- if (match == null) {
+ // Chain: discriminative classifier picks the language,
+ // GLM scores under that language (not bestMatch across all langs).
+ List<String> topLangs =
+ CharSoupLanguageDetector.topShortTextLanguages(text, 1);
+ if (topLangs.isEmpty()) {
+ continue;
+ }
+ String discLang = topLangs.get(0);
+ if (discLang == null || discLang.isEmpty()) {
continue;
}
- float z = GLM.zScoreLengthAdjusted(text, match.getKey());
- LOG.debug("generativeTiebreak: {} -> lang={} z={}",
- entry.getKey(), match.getKey(), z);
+ float z = GLM.zScoreLengthAdjusted(text, discLang);
+ LOG.debug("generativeTiebreak: {} -> discLang={} z={}",
+ entry.getKey(), discLang, z);
if (!Float.isNaN(z) && z > bestZ) {
bestZ = z;
bestKey = entry.getKey();
diff --git
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
index 8a2c75d54f..b6810e2749 100644
---
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
+++
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
@@ -310,38 +310,22 @@ public class GenerativeLanguageModel {
}
}
- /**
- * Score {@code text} against all languages and return the best match.
- */
- public Map.Entry<String, Float> bestMatch(String text) {
- String best = null;
- float bestScore = Float.NEGATIVE_INFINITY;
- for (String lang : langIds) {
- float s = score(text, lang);
- if (!Float.isNaN(s) && s > bestScore) {
- bestScore = s;
- best = lang;
- }
- }
- return best == null ? null : Map.entry(best, bestScore);
- }
-
- /**
- * Average raw score of {@code text} across all CJK languages in the model.
- */
- public float avgCjkScore(String text) {
- double sum = 0;
- int count = 0;
- for (int i = 0; i < langIds.size(); i++) {
- if (!isCjk[i]) continue;
- float s = score(text, langIds.get(i));
- if (!Float.isNaN(s)) {
- sum += s;
- count++;
- }
- }
- return count == 0 ? Float.NaN : (float) (sum / count);
- }
+ // Cross-language score comparison was removed (formerly bestMatch /
+ // avgCjkScore). The per-class raw scores are not comparable across
+ // languages: each language's per-bucket log-probabilities are
+ // normalised by that language's own training-corpus total, so
+ // small-corpus languages produce systematically higher per-bucket
+ // log-probs than large-corpus languages on out-of-class input.
+ // In practice a real Chinese probe scored as "zho" would return
+ // a raw score around -8 while scoring the same probe against a small
+ // minor-language model would return a raw score around -3 to -5 just
+ // from the smoothing-denominator artefact. The GLM is designed to
+ // answer "given it is language X, how natural is this text as
+ // language X?" — not "which language is this?". Callers that need
+ // to pick a language first must do so with the discriminative
+ // classifier (CharSoupLanguageDetector), then pass that language
+ // explicitly to score(), zScore(), or
+ // zScoreLengthAdjusted().
// ---- Z-score API ----