This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 66a83d3441 charset and junk tweaks (#2794)
66a83d3441 is described below
commit 66a83d3441767ab8f613787d629deabc674124bd
Author: Tim Allison <[email protected]>
AuthorDate: Wed Apr 29 16:27:01 2026 -0400
charset and junk tweaks (#2794)
---
.../tika/detect/CompositeEncodingDetector.java | 31 +++-
.../tika/ml/chardetect/HtmlByteStripper.java | 16 ++
.../ml/chardetect/MojibusterEncodingDetector.java | 102 ++++++++++-
.../ml/chardetect/StructuralEncodingRules.java | 108 ++++++++++++
.../ml/junkdetect/JunkFilterEncodingDetector.java | 186 ++++++++++++++++++++-
.../apache/tika/ml/junkdetect/EntityRefProbe.java | 164 ++++++++++++++++++
6 files changed, 598 insertions(+), 9 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index fc8b0ab038..ab554d5d7b 100644
---
a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++
b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -109,18 +109,22 @@ public class CompositeEncodingDetector implements
EncodingDetector, Serializable
ParseContext parseContext)
throws IOException {
for (EncodingDetector detector : getDetectors()) {
+ String name = detector.getClass().getSimpleName();
+ LOG.trace("chardet enter {}", name);
List<EncodingResult> results = detector.detect(tis, metadata,
parseContext);
if (!results.isEmpty()) {
Charset detected = results.get(0).getCharset();
+ LOG.trace("chardet hit {} -> {} [{}] conf={}", name,
detected.name(),
+ results.get(0).getResultType(),
results.get(0).getConfidence());
metadata.set(TikaCoreProperties.DETECTED_ENCODING,
detected.name());
- if (!detector.getClass().getSimpleName()
- .equals("CompositeEncodingDetector")) {
- metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
- detector.getClass().getSimpleName());
+ if (!name.equals("CompositeEncodingDetector")) {
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR, name);
}
return results;
}
+ LOG.trace("chardet miss {} (empty)", name);
}
+ LOG.trace("chardet no detector returned a result");
return Collections.emptyList();
}
@@ -142,16 +146,35 @@ public class CompositeEncodingDetector implements
EncodingDetector, Serializable
try {
for (EncodingDetector detector : baseDetectors) {
+ String name = detector.getClass().getSimpleName();
+ LOG.trace("chardet enter {}", name);
List<EncodingResult> detected = detector.detect(tis, metadata,
parseContext);
if (!detected.isEmpty()) {
+ LOG.trace("chardet emit {} -> {} [{}] conf={}", name,
+ detected.get(0).getCharset().name(),
+ detected.get(0).getResultType(),
+ detected.get(0).getConfidence());
context.addResult(detected,
detector.getClass().getSimpleName());
+ } else {
+ LOG.trace("chardet miss {} (empty)", name);
}
}
// Each base detector handles its own mark/reset, so the stream is
// back at the start here. CharSoup handles its own mark/reset too.
+ String metaName = metaDetector.getClass().getSimpleName();
+ LOG.trace("chardet meta enter {} (candidates={})", metaName,
+ context.getUniqueCharsets());
List<EncodingResult> metaResults =
metaDetector.detect(tis, metadata, parseContext);
+ if (!metaResults.isEmpty()) {
+ LOG.trace("chardet meta {} -> {} (arbitration={})", metaName,
+ metaResults.get(0).getCharset().name(),
+ context.getArbitrationInfo());
+ } else {
+ LOG.trace("chardet meta {} abstained (arbitration={})",
metaName,
+ context.getArbitrationInfo());
+ }
List<EncodingResult> finalResults;
String detectorName;
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
index babd931e7e..1443ae9723 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
@@ -61,6 +61,11 @@ public final class HtmlByteStripper {
private static final int RAW_BODY = 7;
private static final int ATTR_NAME = 8;
private static final int ATTR_AFTER_EQUALS = 9;
+ /** Inside a markup declaration like {@code <!DOCTYPE html ...>} or
+ * a processing instruction like {@code <?xml version="1.0"?>}. Both
+ * end at the next {@code >}. Internal subsets ({@code <!DOCTYPE foo [
... ]>})
+ * are rare; we'd stop at the first nested {@code >}. Acceptable. */
+ private static final int DECL_OR_PI = 10;
private static final byte[] SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
private static final byte[] STYLE = {'s', 't', 'y', 'l', 'e'};
@@ -158,6 +163,11 @@ public final class HtmlByteStripper {
state = COMMENT;
tagCount++;
i += 2;
+ } else if (b == '!' || b == '?') {
+ // <!DOCTYPE ...>, <!ATTLIST ...>, <?xml ...?>, etc.
+ // Consume bytes up to the next '>'.
+ state = DECL_OR_PI;
+ tagCount++;
} else if (b == '/' || isAsciiLetter(b)) {
state = TAG_NAME;
nameStart = i;
@@ -173,6 +183,12 @@ public final class HtmlByteStripper {
}
break;
+ case DECL_OR_PI:
+ if (b == '>') {
+ state = TEXT;
+ }
+ break;
+
case TAG_NAME:
if (isTagNameTerminator(b)) {
int nameLen = i - nameStart;
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index d1746b3781..ca6b71c0ff 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -25,6 +25,8 @@ import java.util.List;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.EncodingDetector;
@@ -67,6 +69,9 @@ import org.apache.tika.parser.ParseContext;
@TikaComponent(name = "mojibuster-encoding-detector")
public class MojibusterEncodingDetector implements EncodingDetector {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(MojibusterEncodingDetector.class);
+
/** Default NB bigram model on the classpath. */
public static final String DEFAULT_MODEL_RESOURCE =
"/org/apache/tika/ml/chardetect/nb-bigram.bin";
@@ -121,6 +126,24 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
/** Confidence for the windows-1252 fallback emitted on empty/ASCII
probes. */
private static final float FALLBACK_CONFIDENCE = 0.1f;
+ /**
+ * Maximum ratio of malformed UTF-8 sequences (error events, as counted
+ * by {@code StructuralEncodingRules.countUtf8Errors}) to probe bytes
+ * that we tolerate before disqualifying NB's UTF-8 pick. Real-world
+ * UTF-8 files often contain one or two corrupted bytes (copy-paste
+ * accidents, truncation, transport flips) — rejecting them outright
+ * would force the detector to drop a high-confidence UTF-8
+ * classification on otherwise-valid text and fall through to
+ * {@code AutoDetectReader.detect}, which raises {@code TikaException}
+ * when the chain returns no candidates. 0.5% (one error event per 200
+ * probe bytes) accommodates "tiny corruption" while still rejecting
+ * genuinely-non-UTF-8 streams (which would have many more malformed
+ * sequences).
+ *
+ * <p>TACTICAL: remove or revisit when Mojibuster's UTF-8 grammar
+ * check is replaced with a probabilistic decoder that returns a
+ * confidence score directly.</p>
+ */
+ private static final double UTF8_MALFORMED_TOLERANCE = 0.005;
+
/** Windows-1252: the WHATWG-canonical default for unlabeled Western
content. */
private static final String WIN1252 = "windows-1252";
@@ -173,12 +196,18 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
* structural checks, which need byte alignment intact.
*/
public List<EncodingResult> detect(byte[] probe, Metadata metadata) {
+ if (LOG.isTraceEnabled()) {
+ int probeLen = probe == null ? 0 : probe.length;
+ int highBytes = probe == null ? 0 : countHighBytes(probe);
+ LOG.trace("mojibuster enter probe={}B highBytes={}", probeLen,
highBytes);
+ }
// Empty / near-empty probes: return the WHATWG default so
// downstream callers don't see an empty list (which propagates
// up as "Failed to detect the character encoding of a
// document" in TXTParser / RFC822Parser / etc). windows-1252
// at low confidence lets any declarative hint override.
if (probe == null || probe.length < 2) {
+ LOG.trace("mojibuster -> windows-1252 fallback (probe<2B)");
return windows1252Fallback();
}
@@ -190,6 +219,7 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// consulting NB so we don't hand back a bias-driven x-MacRoman
// or IBM850 pick.
if (isPureAscii(probe)) {
+ LOG.trace("mojibuster -> windows-1252 fallback (pure ASCII)");
return windows1252Fallback();
}
@@ -207,6 +237,8 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// UTF-32 codepoint validity — structural candidate. Also
// collects UTF-16 surrogate invalidity flags used below.
WideUnicodeDetector.Result wide = WideUnicodeDetector.analyze(probe);
+ LOG.trace("mojibuster wideUnicode charset={} invalidLE={}
invalidBE={}",
+ wide.charset, wide.invalidUtf16Le, wide.invalidUtf16Be);
if (wide.charset != null) {
pool.add(new EncodingResult(wide.charset, UTF32_STRUCTURAL_CONF,
wide.charset.name(),
EncodingResult.ResultType.STRUCTURAL));
@@ -222,14 +254,19 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// discriminate UTF-16 reliably (see
// why-stride1-bigrams-dont-work-for-utf16.md), so we keep
// UTF-16 out of NB training and delegate to the specialist.
- if (StructuralEncodingRules.has2ByteColumnAsymmetryEvidence(probe)) {
+ boolean utf16Gate =
StructuralEncodingRules.has2ByteColumnAsymmetryEvidence(probe);
+ LOG.trace("mojibuster utf16Gate={}", utf16Gate);
+ if (utf16Gate) {
List<EncodingResult> utf16Results = utf16.detect(probe);
+ LOG.trace("mojibuster utf16Specialist returned {} candidates",
utf16Results.size());
for (EncodingResult r : utf16Results) {
String name = r.getCharset().name();
boolean invalid =
("UTF-16LE".equals(name) && wide.invalidUtf16Le)
|| ("UTF-16BE".equals(name) && wide.invalidUtf16Be);
+ LOG.trace("mojibuster utf16Specialist candidate={}
invalid={}", name, invalid);
if (!invalid) {
+ LOG.trace("mojibuster -> utf16 short-circuit {}", name);
return List.of(new EncodingResult(r.getCharset(),
UTF32_STRUCTURAL_CONF, r.getLabel(),
EncodingResult.ResultType.STRUCTURAL));
@@ -251,6 +288,29 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// • AMBIGUOUS (pure ASCII or only truncated lead): no
// emission; NB + fallbacks handle it.
StructuralEncodingRules.Utf8Result utf8 =
StructuralEncodingRules.checkUtf8(probe);
+ // TACTICAL: tolerate small corruption. If the grammar check returned
+ // NOT_UTF8 but the malformed-byte fraction is tiny, treat as UTF-8 —
+ // a single bad continuation byte in 2KB of CJK is nearly always
+ // corruption, not "this isn't UTF-8". Remove when grammar check is
+ // replaced with a probabilistic decoder.
+ boolean utf8Tolerated = false;
+ if (utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8) {
+ int errors = StructuralEncodingRules.countUtf8Errors(probe);
+ if (errors > 0
+ && (double) errors / probe.length <=
UTF8_MALFORMED_TOLERANCE) {
+ utf8Tolerated = true;
+ LOG.trace("mojibuster utf8 NOT_UTF8 tolerated: {} error events
in {}B ({}%)",
+ errors, probe.length,
+ String.format(Locale.ROOT, "%.3f",
+ 100.0 * errors / probe.length));
+ } else if (errors > 0) {
+ LOG.trace("mojibuster utf8 NOT_UTF8 NOT tolerated: {} error
events in {}B ({}%)",
+ errors, probe.length,
+ String.format(Locale.ROOT, "%.3f",
+ 100.0 * errors / probe.length));
+ }
+ }
+ LOG.trace("mojibuster utf8Check={} tolerated={}", utf8, utf8Tolerated);
if (utf8 == StructuralEncodingRules.Utf8Result.LIKELY_UTF8) {
pool.add(new EncodingResult(
java.nio.charset.StandardCharsets.UTF_8,
@@ -267,11 +327,22 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// Naive-Bayes top-K candidates — statistical.
List<EncodingResult> nbResults = nb.detect(nbInput);
+ if (LOG.isTraceEnabled()) {
+ StringBuilder sb = new StringBuilder();
+ for (EncodingResult r : nbResults) {
+ if (sb.length() > 0) sb.append(", ");
+ sb.append(r.getCharset().name())
+ .append("@").append(String.format(Locale.ROOT, "%.2f",
r.getConfidence()));
+ }
+ LOG.trace("mojibuster nb({}B input) -> [{}]", nbInput.length, sb);
+ }
for (EncodingResult r : nbResults) {
String name = r.getCharset().name();
- // NOT_UTF8 disqualifier.
+ // NOT_UTF8 disqualifier — applied unless the malformed-byte
+ // fraction is tiny (see UTF8_MALFORMED_TOLERANCE).
if ("UTF-8".equals(name)
- && utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8) {
+ && utf8 == StructuralEncodingRules.Utf8Result.NOT_UTF8
+ && !utf8Tolerated) {
continue;
}
pool.add(r);
@@ -281,7 +352,30 @@ public class MojibusterEncodingDetector implements
EncodingDetector {
// Low-evidence Latin-sibling → windows-1252 rewrite. Runs
// after sort so only the final top candidate is considered
// for the rewrite, preserving lower-ranked siblings.
- return applyLatinSiblingFallback(probe, ranked);
+ List<EncodingResult> finalResults = applyLatinSiblingFallback(probe,
ranked);
+ // Never return an empty list. An empty result propagates up as
+ // "Failed to detect the character encoding of a document" in
+ // AutoDetectReader.detect, which kills parsing entirely. When
+ // every layer has rejected its candidates (NOT_UTF8 disqualifier
+ // dropped NB's only pick, NB returned no candidates at all,
+ // wide-Unicode and UTF-16 specialists abstained), fall back to
+ // the WHATWG default. Downstream JunkFilter / declarative
+ // candidates can still override at low confidence.
+ if (finalResults.isEmpty()) {
+ LOG.trace("mojibuster pool empty -> windows-1252 fallback");
+ return windows1252Fallback();
+ }
+ if (LOG.isTraceEnabled()) {
+ StringBuilder sb = new StringBuilder();
+ for (EncodingResult r : finalResults) {
+ if (sb.length() > 0) sb.append(", ");
+ sb.append(r.getCharset().name())
+ .append("[").append(r.getResultType()).append("]")
+ .append("@").append(String.format(Locale.ROOT, "%.2f",
r.getConfidence()));
+ }
+ LOG.trace("mojibuster exit ({} results) [{}]",
finalResults.size(), sb);
+ }
+ return finalResults;
}
/**
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
index d220031bac..2c46a8db3a 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
@@ -749,6 +749,114 @@ public final class StructuralEncodingRules {
return Utf8Result.AMBIGUOUS;
}
+    /**
+     * Counts the malformed UTF-8 <em>sequences</em> in the sample — one event
+     * per bad lead, orphaned continuation, overlong form, surrogate, or
+     * out-of-range code point, regardless of how many bytes the bad sequence
+     * spans. Unlike {@link #checkUtf8}, this does not early-exit on the first
+     * bad sequence; it scans the entire array, resyncing after each error.
+     * Returns 0 for a clean UTF-8 stream.
+     *
+     * <p>Useful for "tolerant" UTF-8 acceptance: a real-world UTF-8 file with
+     * a few corrupted sequences (copy-paste artefact, truncated upstream,
+     * MIME transport flip) should still be recognized as UTF-8 rather than
+     * rejected outright. The caller decides what error count is tolerable
+     * (typically as a fraction of probe length).</p>
+     *
+     * <p>The count is intended to approximate the number of U+FFFD
+     * replacements a decoder emits (one per malformed sequence); it is not
+     * guaranteed to equal the output of any particular decoder.</p>
+     *
+     * @param bytes the sample to scan; must not be {@code null}
+     * @return number of malformed UTF-8 sequence events
+     */
+    public static int countUtf8Errors(byte[] bytes) {
+        return countUtf8Errors(bytes, 0, bytes.length);
+    }
+
+    /**
+     * Range variant of {@link #countUtf8Errors(byte[])}: counts malformed
+     * UTF-8 sequence events in {@code bytes[offset .. offset + length)}.
+     *
+     * <p>Resync rule: when a lead byte is followed by a byte that is not a
+     * continuation, the lead plus the valid continuations already consumed
+     * count as one event and scanning resumes <em>at the offending byte</em>.
+     * Skipping the whole intended sequence length instead would swallow that
+     * byte and could land in the middle of the next, valid sequence, whose
+     * tail would then be mis-counted as orphaned continuations.</p>
+     *
+     * <p>A sequence cut short by the end of the range is not an error as long
+     * as every byte that is present is well-formed.</p>
+     *
+     * @param bytes  the sample to scan; must not be {@code null}
+     * @param offset index of the first byte to examine
+     * @param length number of bytes to examine; the range is not validated
+     *               up front
+     * @return number of malformed UTF-8 sequence events in the range
+     */
+    public static int countUtf8Errors(byte[] bytes, int offset, int length) {
+        int errors = 0;
+        int i = offset;
+        int end = offset + length;
+        while (i < end) {
+            int b = bytes[i] & 0xFF;
+            if (b < 0x80) {
+                // ASCII
+                i++;
+                continue;
+            }
+            int seqLen;
+            if (b >= 0xF8) {
+                // 5-/6-byte sequences are not valid Unicode
+                errors++;
+                i++;
+                continue;
+            } else if (b >= 0xF0) {
+                seqLen = 4;
+            } else if (b >= 0xE0) {
+                seqLen = 3;
+            } else if (b >= 0xC2) {
+                seqLen = 2;
+            } else {
+                // 0x80..0xBF: continuation byte without a lead;
+                // 0xC0/0xC1: lead of an overlong 2-byte form
+                errors++;
+                i++;
+                continue;
+            }
+            // Validate the continuation bytes that are actually present.
+            int avail = Math.min(seqLen, end - i);
+            int k = 1;
+            while (k < avail && (bytes[i + k] & 0xC0) == 0x80) {
+                k++;
+            }
+            if (k < avail) {
+                // bytes[i + k] is not a continuation: one event for the lead
+                // and the k - 1 good continuations, then re-examine that byte.
+                errors++;
+                i += k;
+                continue;
+            }
+            if (avail < seqLen) {
+                // Well-formed so far but truncated at probe end — not an error.
+                break;
+            }
+            // Code point range / surrogate checks on the complete sequence.
+            boolean invalid = false;
+            if (seqLen == 3) {
+                int cp = ((b & 0x0F) << 12)
+                        | ((bytes[i + 1] & 0x3F) << 6)
+                        | (bytes[i + 2] & 0x3F);
+                invalid = cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF);
+            } else if (seqLen == 4) {
+                int cp = ((b & 0x07) << 18)
+                        | ((bytes[i + 1] & 0x3F) << 12)
+                        | ((bytes[i + 2] & 0x3F) << 6)
+                        | (bytes[i + 3] & 0x3F);
+                invalid = cp < 0x10000 || cp > 0x10FFFF;
+            }
+            if (invalid) {
+                errors++;
+            }
+            i += seqLen;
+        }
+        return errors;
+    }
+
// -----------------------------------------------------------------------
// Result type
// -----------------------------------------------------------------------
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index fb57dd6f93..fb903f5b00 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -39,6 +39,7 @@ import org.apache.tika.ml.chardetect.HtmlByteStripper;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.quality.TextQualityComparison;
import org.apache.tika.quality.TextQualityDetector;
+import org.apache.tika.quality.TextQualityScore;
/**
* A {@link MetaEncodingDetector} that arbitrates charset candidates by
@@ -75,6 +76,34 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
* default read limit used by the charset base detectors. */
private static final int DEFAULT_READ_LIMIT = 16384;
+ // ---------------------------------------------------------------------
+ // TACTICAL: declarative-override gate constants.
+ //
+ // These exist to compensate for known per-script calibration unevenness
+ // in the quality scorer (HAN noise floor too generous; MALAYALAM/TAMIL/
+ // BENGALI floors too strict). They produce wrong tournaments when an
+ // honest in-document declaration (`<meta charset>` / XML decl) decodes
+ // to sparse non-Latin content that scores junky-but-correct, while a
+ // statistical pick decodes to dense mojibake-Han that scores decent-
+ // but-wrong. See `analyses/2026-04-26-tika-eval-charset-and-other.md`
+ // and the indic-collapse + Korean+Hanja fixtures.
+ //
+ // REMOVE when the quality scorer is recalibrated per-script — the
+ // tournament should then be reliable on its own.
+ // ---------------------------------------------------------------------
+
+ /** Largest gap, in z-score units, by which the tournament winner may
+ * outscore the in-document declaration while the declaration is still
+ * honored; a larger gap keeps the tournament winner. The value sits
+ * above the indic-vs-mojibake-Han deltas (~3-5 units) the override
+ * targets. Small same-script-different-codepage deltas (windows-1252 vs
+ * windows-1257 ≈ 1-2 units) also fall below it, but those cases are
+ * excluded by the separate different-scripts gate, not by this limit. */
+ private static final float DECLARATIVE_OVERRIDE_MAX_DELTA = 6.0f;
+
+ /** Maximum fraction of REPLACEMENT CHARACTER (U+FFFD) in the declared
+ * decoder's output. Above this, the declared charset clearly cannot
+ * decode the bytes and we should not honor the declaration. */
+ private static final double DECLARATIVE_MAX_FFFD_RATE = 0.01;
+
/** Cached quality detector. {@code null} if none is on the classpath. */
private final TextQualityDetector qualityDetector;
@@ -148,7 +177,10 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
byte[] stripDst = new byte[bytes.length];
HtmlByteStripper.Result stripped =
HtmlByteStripper.strip(bytes, 0, bytes.length, stripDst, 0);
- if (stripped.tagCount > 0 && stripped.length > 0) {
+ boolean stripUsed = stripped.tagCount > 0 && stripped.length > 0;
+ LOG.trace("junk-filter strip: input={}B tagCount={} stripped={}B
used={}",
+ bytes.length, stripped.tagCount, stripped.length, stripUsed);
+ if (stripUsed) {
forDecode = new byte[stripped.length];
System.arraycopy(stripDst, 0, forDecode, 0, stripped.length);
}
@@ -160,10 +192,22 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
String decoded = safeDecode(forDecode, cs);
if (decoded != null && !decoded.isEmpty()) {
candidates.put(cs, decoded);
+ if (LOG.isTraceEnabled()) {
+ int sampleLen = Math.min(400, decoded.length());
+ String sample = decoded.substring(0, sampleLen)
+ .replace('\n', ' ').replace('\r', ' ');
+ LOG.trace("junk-filter decoded {}: '{}{}' (len={})",
+ cs.name(), sample,
+ decoded.length() > sampleLen ? "…" : "",
+ decoded.length());
+ }
+ } else {
+ LOG.trace("junk-filter decode {} -> null/empty", cs.name());
}
}
if (candidates.size() <= 1) {
// One or zero candidates produced usable text; nothing to compare.
+ LOG.trace("junk-filter <=1 usable candidate, abstaining");
return Collections.emptyList();
}
// When a DECLARATIVE candidate decodes byte-identically to at least
@@ -175,6 +219,8 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
if (declared != null) {
float conf = context.getTopConfidenceFor(declared);
context.setArbitrationInfo("junk-filter-prefer-declarative");
+ LOG.trace("junk-filter -> {} (declarative with equivalent decode)",
+ declared.name());
return List.of(new EncodingResult(declared, conf));
}
if (allDecodingsIdentical(candidates)) {
@@ -182,6 +228,7 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
// is present. Text quality cannot distinguish them; defer to
// the composite's default ordering.
context.setArbitrationInfo("junk-filter-identical-decodings");
+ LOG.trace("junk-filter all decodings identical, deferring");
return Collections.emptyList();
}
@@ -189,21 +236,158 @@ public class JunkFilterEncodingDetector implements
MetaEncodingDetector {
// every subsequent candidate challenges the current champion.
Iterator<Map.Entry<Charset, String>> it =
candidates.entrySet().iterator();
Map.Entry<Charset, String> champion = it.next();
+ LOG.trace("junk-filter tournament seed: {}", champion.getKey().name());
while (it.hasNext()) {
Map.Entry<Charset, String> challenger = it.next();
TextQualityComparison cmp = qualityDetector.compare(
champion.getKey().name(), champion.getValue(),
challenger.getKey().name(), challenger.getValue());
+ LOG.trace("junk-filter compare {} vs {} -> {} (delta={} A={}
B={})",
+ champion.getKey().name(), challenger.getKey().name(),
+ cmp.winner(), String.format(java.util.Locale.ROOT, "%.3f",
cmp.delta()),
+ cmp.scoreA(), cmp.scoreB());
if ("B".equals(cmp.winner())) {
champion = challenger;
}
}
+ LOG.trace("junk-filter -> {} (tournament champion)",
champion.getKey().name());
+
+ // TACTICAL: declarative override. See class-level comment block.
+ // REMOVE when quality scorer is recalibrated per-script.
+ Charset declarativeOverride = applyInDocumentDeclarativeOverride(
+ context, candidates, champion.getKey());
+ if (declarativeOverride != null) {
+ float conf = context.getTopConfidenceFor(declarativeOverride);
+ context.setArbitrationInfo("junk-filter-declarative-override");
+ LOG.trace("junk-filter -> {} (declarative override of tournament
winner {})",
+ declarativeOverride.name(), champion.getKey().name());
+ return List.of(new EncodingResult(declarativeOverride, conf));
+ }
float confidence = context.getTopConfidenceFor(champion.getKey());
context.setArbitrationInfo("junk-filter-selected");
return List.of(new EncodingResult(champion.getKey(), confidence));
}
+    /**
+     * Tactical fix: lets an in-document {@code <meta charset>} or XML
+     * declaration win over the tournament champion when the quality scorer's
+     * uneven per-script calibration may have mis-ranked candidates of
+     * <em>different scripts</em>.
+     *
+     * <p>The override applies only if every gate below passes; otherwise
+     * {@code null} is returned and the tournament winner stands.</p>
+     * <ol>
+     *   <li>An in-document declaration exists and is not already the
+     *       champion.</li>
+     *   <li>The declaration is not UTF-16 / UTF-32 (see the comment in the
+     *       body).</li>
+     *   <li>Both the declared charset and the champion have decoded text in
+     *       {@code candidates}.</li>
+     *   <li><strong>Clean decode</strong>: the declared decode contains at
+     *       most {@link #DECLARATIVE_MAX_FFFD_RATE} U+FFFD per char.</li>
+     *   <li><strong>Small quality gap</strong>:
+     *       {@code champion.z - declared.z <= DECLARATIVE_OVERRIDE_MAX_DELTA}.</li>
+     *   <li><strong>Different scripts</strong>: the declared text has a known
+     *       dominant script and it differs from the champion's. Same-script
+     *       Latin-cousin lies (e.g. windows-1252 declared on a windows-1257
+     *       file) are left to the tournament.</li>
+     * </ol>
+     *
+     * <p>"In-document" means {@code HtmlEncodingDetector} or any future
+     * XML-decl source — explicitly NOT {@code MetadataCharsetDetector} (outer
+     * Content-Type header), which is more often wrong.</p>
+     *
+     * @param context    detection context holding the base detectors' results
+     * @param candidates decoded text per candidate charset
+     * @param champion   charset that won the quality tournament
+     * @return the declared charset to use instead of {@code champion}, or
+     *         {@code null} to keep the champion
+     */
+    private Charset applyInDocumentDeclarativeOverride(
+            EncodingDetectorContext context,
+            Map<Charset, String> candidates,
+            Charset champion) {
+        Charset declared = findInDocumentDeclarative(context);
+        if (declared == null || declared.equals(champion)) {
+            // nothing declared in-document, or the declaration already won
+            return null;
+        }
+
+        // Per HTML5, <meta charset> cannot validly declare UTF-16 / UTF-32:
+        // the meta tag's own bytes must be parsed before the declaration is
+        // known, and UTF-16/32 require a BOM. BOMDetector runs first in the
+        // chain, so reaching this point with such a declaration means no BOM
+        // was found; treat the declaration as invalid and keep the tournament
+        // winner. This catches govdocs1-style "utf-16 declared on a Latin
+        // file" lies that would otherwise pass as a script-mismatch override.
+        String declaredName = declared.name();
+        boolean wideUnicodeDeclared =
+                declaredName.startsWith("UTF-16") || declaredName.startsWith("UTF-32");
+        if (wideUnicodeDeclared) {
+            LOG.trace("junk-filter declarative-override skipped: UTF-16/32 in "
+                    + "<meta> (HTML5 invalid)");
+            return null;
+        }
+
+        String championText = candidates.get(champion);
+        String declaredText = candidates.get(declared);
+        if (declaredText == null || championText == null) {
+            // one side failed to decode
+            return null;
+        }
+
+        // Gate: the declared decode must be mostly free of U+FFFD.
+        double fffdRate = replacementCharRate(declaredText);
+        if (fffdRate > DECLARATIVE_MAX_FFFD_RATE) {
+            LOG.trace("junk-filter declarative-override skipped: U+FFFD rate {} > {}",
+                    fffdRate, DECLARATIVE_MAX_FFFD_RATE);
+            return null;
+        }
+
+        TextQualityScore declaredScore = qualityDetector.score(declaredText);
+        TextQualityScore championScore = qualityDetector.score(championText);
+
+        // Gate: the champion must not outscore the declaration by too much.
+        float delta = championScore.getZScore() - declaredScore.getZScore();
+        if (delta > DECLARATIVE_OVERRIDE_MAX_DELTA) {
+            LOG.trace("junk-filter declarative-override skipped: delta {} > {}",
+                    delta, DECLARATIVE_OVERRIDE_MAX_DELTA);
+            return null;
+        }
+
+        // Gate: the two decodes must classify as different scripts.
+        String declaredScript = declaredScore.getDominantScript();
+        String championScript = championScore.getDominantScript();
+        boolean scriptsDiffer =
+                declaredScript != null && !declaredScript.equals(championScript);
+        if (!scriptsDiffer) {
+            LOG.trace("junk-filter declarative-override skipped: same script {}",
+                    declaredScript);
+            return null;
+        }
+
+        LOG.trace("junk-filter declarative-override fires: declared={} "
+                        + "(script={}, z={}) vs winner={} (script={}, z={}) delta={}",
+                declared.name(), declaredScript, declaredScore.getZScore(),
+                champion.name(), championScript, championScore.getZScore(),
+                delta);
+        return declared;
+    }
+
+    /**
+     * Scans the context's results in order and returns the charset of the
+     * first one that is both reported by an in-document HTML detector
+     * ({@code HtmlEncodingDetector} or {@code StandardHtmlEncodingDetector})
+     * and typed {@code DECLARATIVE}.
+     *
+     * <p>Outer Content-Type metadata ({@code MetadataCharsetDetector}) is
+     * intentionally excluded — those headers lie too often.</p>
+     *
+     * @param context detection context holding the base detectors' results
+     * @return the in-document declared charset, or {@code null} if there is
+     *         none
+     */
+    private static Charset findInDocumentDeclarative(EncodingDetectorContext context) {
+        for (EncodingDetectorContext.Result candidate : context.getResults()) {
+            String source = candidate.getDetectorName();
+            boolean fromHtmlDetector = "HtmlEncodingDetector".equals(source)
+                    || "StandardHtmlEncodingDetector".equals(source);
+            if (!fromHtmlDetector) {
+                continue;
+            }
+            if (candidate.getResultType() == EncodingResult.ResultType.DECLARATIVE) {
+                return candidate.getCharset();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the fraction of chars in {@code s} that are U+FFFD
+     * (REPLACEMENT CHARACTER) — a proxy for "this charset cannot decode
+     * these bytes".
+     *
+     * @param s decoded text; must not be {@code null}
+     * @return U+FFFD count divided by {@code s.length()}, or {@code 0.0} for
+     *         an empty string
+     */
+    private static double replacementCharRate(String s) {
+        if (s.isEmpty()) {
+            return 0.0;
+        }
+        long count = 0;
+        for (int i = 0; i < s.length(); i++) {
+            // Written as an escape rather than a raw character so the check
+            // does not depend on how the source file is encoded or transported.
+            if (s.charAt(i) == '\uFFFD') {
+                count++;
+            }
+        }
+        return (double) count / s.length();
+    }
+
/**
* Return the first DECLARATIVE charset whose decoded output equals at
* least one other candidate's, or {@code null}.
diff --git
a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
new file mode 100644
index 0000000000..06427990cf
--- /dev/null
+++
b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/EntityRefProbe.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.junkdetect;
+
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.ml.chardetect.HtmlByteStripper;
+import org.apache.tika.quality.TextQualityScore;
+
+/**
+ * One-off probe: score a file's first 16KB under each candidate charset,
+ * with and without HTML-entity-ref expansion. Run via:
+ * {@code mvn -pl :tika-ml-junkdetect exec:java -Dexec.classpathScope=test
+ *   -Dexec.mainClass=org.apache.tika.ml.junkdetect.EntityRefProbe
+ *   -Dexec.args="<file> <charset1> [charset2] ..."}.
+ *
+ * <p>For every charset the (HTML-stripped) bytes are decoded and scored three
+ * ways: as decoded ("raw"), with entity refs expanded, and with entity refs
+ * replaced by a space ("removed").
+ */
+public class EntityRefProbe {
+
+    /** Only this many leading bytes of the input file are probed. */
+    private static final int MAX_BYTES = 16384;
+
+    /** Upper bound on the number of chars echoed on each sample line. */
+    private static final int SAMPLE_CHARS = 180;
+
+    private static final Pattern NUM_DEC =
+            Pattern.compile("&#(\\d{1,7});");
+    private static final Pattern NUM_HEX =
+            Pattern.compile("&#[xX]([0-9a-fA-F]{1,6});");
+    // A small set of named refs likely to appear in HTML.
+    private static final Pattern NAMED =
+            Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
+
+    /**
+     * Union of the three patterns above, used for single-pass expansion:
+     * group 1 = decimal digits, group 2 = hex digits, group 3 = entity name.
+     */
+    private static final Pattern ANY_REF = Pattern.compile(
+            "&(?:#(\\d{1,7})|#[xX]([0-9a-fA-F]{1,6})|(amp|lt|gt|quot|apos|nbsp|copy|reg));");
+
+    /**
+     * Entry point.
+     *
+     * @param args {@code args[0]} is the file to probe; every following
+     *             argument is a charset name to decode it with
+     * @throws Exception on any I/O failure or unknown charset name
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 2) {
+            System.err.println("usage: EntityRefProbe <file> <charset1> [charset2] ...");
+            System.exit(2);
+        }
+        byte[] raw = Files.readAllBytes(Paths.get(args[0]));
+        if (raw.length > MAX_BYTES) {
+            byte[] cap = new byte[MAX_BYTES];
+            System.arraycopy(raw, 0, cap, 0, MAX_BYTES);
+            raw = cap;
+        }
+        // Strip HTML the same way JunkFilterEncodingDetector does.
+        byte[] dst = new byte[raw.length];
+        HtmlByteStripper.Result strip =
+                HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
+        // Keep the unstripped bytes when no tag was seen or stripping left nothing.
+        byte[] forDecode = raw;
+        if (strip.tagCount > 0 && strip.length > 0) {
+            forDecode = new byte[strip.length];
+            System.arraycopy(dst, 0, forDecode, 0, strip.length);
+        }
+        System.out.printf("input=%dB tagCount=%d stripped=%dB%n",
+                raw.length, strip.tagCount, forDecode.length);
+
+        JunkDetector jd = JunkDetector.loadFromClasspath();
+        for (int i = 1; i < args.length; i++) {
+            String csName = args[i];
+            Charset cs = Charset.forName(csName);
+            String decoded = new String(forDecode, cs);
+            String expanded = expandEntities(decoded);
+            String removed = removeEntities(decoded);
+            TextQualityScore rawScore = jd.score(decoded);
+            TextQualityScore expScore = jd.score(expanded);
+            TextQualityScore remScore = jd.score(removed);
+            System.out.println();
+            System.out.printf("== %s ==%n", csName);
+            System.out.printf(" raw len=%-5d %s%n", decoded.length(), rawScore);
+            System.out.printf(" expanded len=%-5d %s%n", expanded.length(), expScore);
+            System.out.printf(" removed len=%-5d %s%n", removed.length(), remScore);
+            printSample(" raw : %s…%n", decoded);
+            printSample(" expanded : %s…%n", expanded);
+            printSample(" removed : %s…%n", removed);
+        }
+    }
+
+    /**
+     * Prints the leading {@link #SAMPLE_CHARS} chars of {@code text} on one
+     * line, with CR and LF each turned into a space.
+     *
+     * @param format printf format containing exactly one {@code %s}
+     * @param text   the text to sample
+     */
+    private static void printSample(String format, String text) {
+        int sample = Math.min(SAMPLE_CHARS, text.length());
+        System.out.printf(format,
+                text.substring(0, sample).replace('\n', ' ').replace('\r', ' '));
+    }
+
+    /**
+     * Expands decimal, hexadecimal and the supported named entity refs in a
+     * single left-to-right pass.
+     *
+     * <p>Text produced by an expansion is never rescanned, so an escaped
+     * reference such as "&amp;lt;" is unescaped exactly one level (to the
+     * four chars of the lt reference) rather than collapsing all the way to
+     * a less-than sign. Numeric refs that do not denote a usable code point
+     * are copied through unchanged.
+     *
+     * @param s decoded text possibly containing entity refs
+     * @return {@code s} with every recognised ref replaced by its character
+     */
+    private static String expandEntities(String s) {
+        StringBuilder buf = new StringBuilder(s.length());
+        Matcher m = ANY_REF.matcher(s);
+        int last = 0;
+        while (m.find()) {
+            buf.append(s, last, m.start());
+            String expansion;
+            if (m.group(1) != null) {
+                expansion = numericRef(m.group(1), 10);
+            } else if (m.group(2) != null) {
+                expansion = numericRef(m.group(2), 16);
+            } else {
+                expansion = namedRef(m.group(3));
+            }
+            // Anything that cannot be expanded stays exactly as it was written.
+            buf.append(expansion != null ? expansion : m.group());
+            last = m.end();
+        }
+        buf.append(s, last, s.length());
+        return buf.toString();
+    }
+
+    /**
+     * Converts the digits of a numeric ref to its character.
+     *
+     * @param digits the digits captured by {@link #ANY_REF}; the pattern caps
+     *               their count so the value always fits in an {@code int}
+     * @param radix  10 or 16
+     * @return the character as a String, or {@code null} when the value is
+     *         outside the Unicode range or is a surrogate (which would leave
+     *         an unpaired surrogate in the output)
+     */
+    private static String numericRef(String digits, int radix) {
+        int cp = Integer.parseInt(digits, radix);
+        boolean surrogate = cp >= Character.MIN_SURROGATE && cp <= Character.MAX_SURROGATE;
+        if (!Character.isValidCodePoint(cp) || surrogate) {
+            return null;
+        }
+        return new String(Character.toChars(cp));
+    }
+
+    /**
+     * Maps one of the supported entity names to its character.
+     *
+     * @param name the entity name without the leading ampersand and trailing semicolon
+     * @return the replacement text, or {@code null} for an unsupported name
+     */
+    private static String namedRef(String name) {
+        switch (name) {
+            case "amp":
+                return "&";
+            case "lt":
+                return "<";
+            case "gt":
+                return ">";
+            case "quot":
+                return "\"";
+            case "apos":
+                return "'";
+            case "nbsp":
+                // NOTE(review): assumes the HTML meaning (NO-BREAK SPACE);
+                // confirm a plain space was not intended here.
+                return "\u00A0";
+            case "copy":
+                return "\u00A9";
+            case "reg":
+                return "\u00AE";
+            default:
+                return null;
+        }
+    }
+
+    /**
+     * Replace every numeric/named entity ref with a single space. Removal
+     * (rather than expansion) keeps the per-charset script signal clean —
+     * expansion injects Unicode codepoints that don't come from the candidate
+     * charset's bytes and can dominate the actual decoded-charset signal.
+     *
+     * @param s decoded text possibly containing entity refs
+     * @return {@code s} with each recognised ref replaced by one space
+     */
+    private static String removeEntities(String s) {
+        String r = NUM_DEC.matcher(s).replaceAll(" ");
+        r = NUM_HEX.matcher(r).replaceAll(" ");
+        r = NAMED.matcher(r).replaceAll(" ");
+        return r;
+    }
+}