(tika) 01/01: drop chunking

tallison Thu, 28 May 2026 11:34:03 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch langdetect-charsoup-no-chunking
in repository https://gitbox.apache.org/repos/asf/tika.git


commit a4406673fcfa8601216f8e8637ee64fce1c699a3
Author: tallison <[email protected]>
AuthorDate: Thu May 28 14:27:58 2026 -0400

    drop chunking
---
 .../charsoup/CharSoupLanguageDetector.java         | 101 +++++++--------------
 1 file changed, 31 insertions(+), 70 deletions(-)

diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 7742efd7d9..2859971ddc 100644
--- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -41,23 +41,20 @@ import org.apache.tika.language.detect.LanguageResult;
  * trained on Wikipedia (primary corpus) with MADLAD supplements for thin languages.
  * <p>
  * Text is buffered via {@link #addText(char[], int, int)} up to
- * {@link CharSoupFeatureExtractor#MAX_TEXT_LENGTH} characters. At {@link #detectAll()} time,
- * the buffer is evaluated in independent {@value #CHUNK_SIZE}-character chunks.
- * Each chunk runs the full preprocessing pipeline (truncate → strip URLs/emails →
- * NFC normalize → extract bigram features → score via raw logits). If the first
- * chunk produces high entropy (indicating junk, code, or non-language content),
- * the next chunk is tried. The result from the chunk with the lowest entropy
- * is returned. This avoids polluting the language signal with leading junk while
- * keeping the implementation simple and predictable.
+ * {@link CharSoupFeatureExtractor#MAX_TEXT_LENGTH} characters (configurable
+ * via {@link #setMaxLength(int)}). At {@link #detectAll()} the entire buffer
+ * is fed through the full preprocessing pipeline (strip URLs/emails →
+ * NFC normalize → extract bigram features) and scored once. The verdict
+ * is the argmax of the (group-collapsed) logits over the whole input.
  * </p>
  * <p>
  * Inference uses raw logits throughout — no softmax distribution is ever computed.
-     * Confidence is based on the <em>margin</em> between the top two logits after
-     * confusable-group collapsing: {@code sigmoid(top_logit − second_logit)}.
-     * This is invariant to the number of classes and provides a stable confidence
-     * signal from short snippets up to full documents. Per-class {@code rawScore}
-     * is {@code sigmoid(logit_c − best_competitor_logit)}: the winner gets a value
-     * above 0.5, all others below.
+ * Confidence is based on the <em>margin</em> between the top two logits after
+ * confusable-group collapsing: {@code sigmoid(top_logit − second_logit)}.
+ * This is invariant to the number of classes and provides a stable confidence
+ * signal from short snippets up to full documents. Per-class {@code rawScore}
+ * is {@code sigmoid(logit_c − best_competitor_logit)}: the winner gets a value
+ * above 0.5, all others below.
  * </p>
  */
 @TikaComponent(name = "charsoup-language-detector")
@@ -73,36 +70,13 @@ public class CharSoupLanguageDetector extends LanguageDetector implements SelfCo
     private static final String MODEL_RESOURCE =
             "/org/apache/tika/langdetect/charsoup/langdetect-20260320.bin";
 
-    /**
-     * Size (in chars) of each independent chunk evaluated during detection.
-     * If the first chunk yields high entropy (junk/code), the next chunk
-     * is tried, and so on, until a confident result is found or the buffer
-     * is exhausted. Each chunk is preprocessed and evaluated independently
-     * so that junk in one chunk does not pollute the signal in the next.
-     */
-    private static final int CHUNK_SIZE = 5_000;
-
     /**
      * Buffer length at which {@link #hasEnoughText()} returns true.
-     * One chunk is more than sufficient for reliable language detection;
-     * this is set to two chunks so the detector has a fallback if the
-     * first chunk is junk.
-     */
-    private static final int ENOUGH_TEXT_LENGTH = CHUNK_SIZE * 2;
-
-    /**
-     * Maximum entropy (in bits) for a chunk to be considered "confident
-     * enough" to return. If a chunk's collapsed-distribution entropy
-     * exceeds this threshold, the detector moves on to the next chunk.
-     * <p>
-     * Typical values:
-     * <ul>
-     *   <li>&lt; 1.0 — clean, single-language text</li>
-     *   <li>1.0–3.0 — confusable language or short text</li>
-     *   <li>&gt; 3.5 — likely junk (code, OCR garbage, binary, etc.)</li>
-     * </ul>
+     * 10,000 characters is comfortably above the saturation point where
+     * the bigram-NB model has full discriminative signal on typical
+     * prose; streaming callers can stop feeding once this is reached.
      */
-    private static final float ENTROPY_THRESHOLD = 3.5f;
+    private static final int ENOUGH_TEXT_LENGTH = 10_000;
 
     /**
      * Confusable language groups — languages within the same group are nearly
@@ -721,40 +695,27 @@ public class CharSoupLanguageDetector extends LanguageDetector implements SelfCo
     @Override
     public List<LanguageResult> detectAll() {
         String text = buffer.toString();
-        if (text.isEmpty()) {
+        // Cheap empty/whitespace-only short-circuit so callers see the
+        // explicit NULL result instead of model-bias logits computed from
+        // an empty feature vector.
+        if (text.isBlank()) {
             lastEntropy = Float.NaN;
             return Collections.singletonList(LanguageResult.NULL);
         }
 
-        int len = text.length();
-        float[] bestLogits = null;
-        float bestEntropy = Float.MAX_VALUE;
-        String bestChunk = null;
+        // Single full-buffer extraction.  The feature extractor is
+        // whitespace-invariant (only letter-letter / sentinel-letter /
+        // letter-sentinel bigrams are emitted) and bounded internally
+        // at MAX_TEXT_LENGTH; the caller's buffer is already bounded
+        // by setMaxLength(...) so the work here is linear in
+        // min(buffer.length, MAX_TEXT_LENGTH).
         int[] features = new int[extractor.getNumBuckets()];
-
-        for (int start = 0; start < len; start += CHUNK_SIZE) {
-            int end = Math.min(start + CHUNK_SIZE, len);
-            String chunk = text.substring(start, end);
-
-            extractor.extractAndCount(chunk, features);
-            float[] logits = model.predictLogits(features);
-            logits = applyScriptGate(logits, chunk, classScript);
-            float[] collapsed = collapseGroups(logits, groupIndices);
-
-            float entropy = entropyFromLogits(collapsed);
-
-            if (entropy < bestEntropy) {
-                bestEntropy = entropy;
-                bestLogits = collapsed;
-                bestChunk = chunk;
-            }
-
-            if (entropy < ENTROPY_THRESHOLD) {
-                break;
-            }
-        }
-
-        return buildResults(bestLogits, bestEntropy);
+        extractor.extractAndCount(text, features);
+        float[] logits = model.predictLogits(features);
+        logits = applyScriptGate(logits, text, classScript);
+        float[] collapsed = collapseGroups(logits, groupIndices);
+        float entropy = entropyFromLogits(collapsed);
+        return buildResults(collapsed, entropy);
     }
 
     /**

(tika) 01/01: drop chunking

Reply via email to