This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch charset-cjk-cc-mirror in repository https://gitbox.apache.org/repos/asf/tika.git
commit c6b5d7f6d5720c4f21106e3ab00ec665a1faad10 Author: tallison <[email protected]> AuthorDate: Fri May 29 15:31:57 2026 -0400 dynamic top-n selector --- .../ml/junkdetect/JunkFilterEncodingDetector.java | 104 ++++++++++++++++----- 1 file changed, 82 insertions(+), 22 deletions(-) diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java index 056c768a65..b7810158ea 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -18,9 +18,13 @@ package org.apache.tika.ml.junkdetect; import java.io.IOException; import java.nio.charset.Charset; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -80,6 +84,16 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { * anchor instead of arbitrating near-identical decodes by quality. */ private static final float NO_INFO_CONFIDENCE = 0.1f; + // Adaptive candidate band (TIKA speed lever). The tournament only needs + // NB's top-2 statistical candidates plus any lower-ranked candidate whose + // confidence is still at least MIN_TAIL_CONFIDENCE; deeper, low-confidence candidates + // are clearly dominated and almost never win (measured: δ=0.5 retains + // ~98-99% of selected winners, ~20% smaller pool). Anchors (DECLARATIVE, + // STRUCTURAL) are always kept regardless of confidence. Quality impact is + // validated by a full common-token/OOV eval, NOT assumed. 
+ private static final int ALWAYS_KEEP_TOP_N = 2; + private static final float MIN_TAIL_CONFIDENCE = 0.5f; + /** Cached quality detector. {@code null} if none is on the classpath. */ private final TextQualityDetector qualityDetector; @@ -152,24 +166,25 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { // become codepoints whose cross-script transitions expose mojibake // under a wrong decoding (AIT5 case). Map<Charset, String> candidates = new LinkedHashMap<>(); - for (Charset cs : uniqueCharsets) { - String decoded = safeDecode(bytes, cs); - if (decoded != null && !decoded.isEmpty()) { - decoded = HtmlContentCleaner.clean(decoded); + // Dedup: charsets that decode the raw probe to the identical string + // (e.g. GB18030/GBK, x-windows-949/EUC-KR on non-extension content) + // share one clean() call — the cleaned result is identical by + // construction, so this is quality-neutral, purely a work saving. + Map<String, String> cleanedByRaw = new HashMap<>(); + Set<Charset> candidateCharsets = bandFilter(context, uniqueCharsets); + for (Charset cs : candidateCharsets) { + String raw = safeDecode(bytes, cs); + if (raw == null || raw.isEmpty()) { + LOG.trace("junk-filter decode {} -> null/empty", cs.name()); + continue; + } + String decoded = cleanedByRaw.get(raw); + if (decoded == null) { + decoded = HtmlContentCleaner.clean(raw); + cleanedByRaw.put(raw, decoded); } if (decoded != null && !decoded.isEmpty()) { candidates.put(cs, decoded); - if (LOG.isTraceEnabled()) { - int sampleLen = Math.min(400, decoded.length()); - String sample = decoded.substring(0, sampleLen) - .replace('\n', ' ').replace('\r', ' '); - LOG.trace("junk-filter decoded {}: '{}{}' (len={})", - cs.name(), sample, - decoded.length() > sampleLen ? 
"…" : "", - decoded.length()); - } - } else { - LOG.trace("junk-filter decode {} -> null/empty", cs.name()); } } if (candidates.size() <= 1) { @@ -228,15 +243,20 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { Charset champion = null; double championZ = Double.NEGATIVE_INFINITY; Map<Charset, Double> scoreByCharset = new LinkedHashMap<>(); + // Dedup: identical decoded text scores identically; score it once. + Map<String, Float> zByText = new HashMap<>(); for (Map.Entry<Charset, String> entry : candidates.entrySet()) { - org.apache.tika.quality.TextQualityScore sc = - qualityDetector.score(entry.getValue()); - float rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : sc.getZScore(); + String text = entry.getValue(); + Float cached = zByText.get(text); + float rawZ; + if (cached != null) { + rawZ = cached; + } else { + org.apache.tika.quality.TextQualityScore sc = qualityDetector.score(text); + rawZ = sc.isUnknown() ? Float.NEGATIVE_INFINITY : sc.getZScore(); + zByText.put(text, rawZ); + } scoreByCharset.put(entry.getKey(), (double) rawZ); - LOG.trace("junk-filter score {} z={} script={}", - entry.getKey().name(), - String.format(java.util.Locale.ROOT, "%.3f", rawZ), - sc.isUnknown() ? "UNKNOWN" : sc.getDominantScript()); if (rawZ > championZ) { championZ = rawZ; champion = entry.getKey(); @@ -274,6 +294,46 @@ public class JunkFilterEncodingDetector implements MetaEncodingDetector { return List.of(new EncodingResult(champion, confidence)); } + /** + * Restrict the candidate set the tournament will decode+clean+score: keep + * every DECLARATIVE/STRUCTURAL anchor (author intent / byte-grammar proof), + * plus the top {@link #ALWAYS_KEEP_TOP_N} STATISTICAL candidates by + * confidence, plus any deeper STATISTICAL candidate still within + * {@link #MIN_TAIL_CONFIDENCE}. Drops the dominated low-confidence tail — + * the speed lever — without removing any anchor or NB's real contenders. 
+ * Returns a subset of {@code all}, preserving its iteration order. + */ + private static Set<Charset> bandFilter(EncodingDetectorContext context, Set<Charset> all) { + Set<Charset> anchors = new HashSet<>(); + List<EncodingResult> stats = new ArrayList<>(); + for (EncodingDetectorContext.Result r : context.getResults()) { + for (EncodingResult er : r.getEncodingResults()) { + EncodingResult.ResultType t = er.getResultType(); + if (t == EncodingResult.ResultType.DECLARATIVE + || t == EncodingResult.ResultType.STRUCTURAL) { + anchors.add(er.getCharset()); + } else if (t == EncodingResult.ResultType.STATISTICAL) { + stats.add(er); + } + } + } + stats.sort((a, b) -> Float.compare(b.getConfidence(), a.getConfidence())); + Set<Charset> keepStat = new HashSet<>(); + for (int i = 0; i < stats.size(); i++) { + if (i < ALWAYS_KEEP_TOP_N + || stats.get(i).getConfidence() >= MIN_TAIL_CONFIDENCE) { + keepStat.add(stats.get(i).getCharset()); + } + } + Set<Charset> kept = new LinkedHashSet<>(); + for (Charset cs : all) { + if (anchors.contains(cs) || keepStat.contains(cs)) { + kept.add(cs); + } + } + return kept; + } + /** * True if some detector produced a confident non-declarative signal: any * STRUCTURAL result (byte-grammar proof), or any STATISTICAL result above
