This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4720-wiring in repository https://gitbox.apache.org/repos/asf/tika.git
commit 40a75367965a8c0589afed4e2f816af0e91b0966 Author: tallison <[email protected]> AuthorDate: Fri Apr 24 17:14:55 2026 -0400 TIKA-4720: add JunkFilterEncodingDetector --- tika-ml/pom.xml | 13 + tika-ml/tika-ml-junkdetect/pom.xml | 6 + .../apache/tika/ml/junkdetect/JunkDetector.java | 18 ++ .../ml/junkdetect/JunkFilterEncodingDetector.java | 301 +++++++++++++++++++++ .../junkdetect/JunkFilterEncodingDetectorTest.java | 164 +++++++++++ .../pom.xml | 6 + .../tika/config/TikaEncodingDetectorTest.java | 42 ++- .../TIKA-2485-encoding-detector-mark-limits.json | 3 + 8 files changed, 531 insertions(+), 22 deletions(-) diff --git a/tika-ml/pom.xml b/tika-ml/pom.xml index 5c9cf03af1..11ddcb221c 100644 --- a/tika-ml/pom.xml +++ b/tika-ml/pom.xml @@ -39,6 +39,19 @@ <build> <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <annotationProcessorPaths> + <path> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + </path> + </annotationProcessorPaths> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-checkstyle-plugin</artifactId> diff --git a/tika-ml/tika-ml-junkdetect/pom.xml b/tika-ml/tika-ml-junkdetect/pom.xml index 672e49195a..39eb506efd 100644 --- a/tika-ml/tika-ml-junkdetect/pom.xml +++ b/tika-ml/tika-ml-junkdetect/pom.xml @@ -43,6 +43,12 @@ <artifactId>tika-core</artifactId> <version>${revision}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-ml-core</artifactId> diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java index 07aeb64164..1719043f40 100644 --- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkDetector.java @@ -167,6 +167,24 @@ public final class JunkDetector implements TextQualityDetector { } } + /** + * {@link java.util.ServiceLoader} provider hook (Java 9+). Allows + * {@code JunkDetector} to be registered as a + * {@link org.apache.tika.quality.TextQualityDetector} SPI implementation + * even though its construction goes through + * {@link #loadFromClasspath()} rather than a public no-arg constructor. + * + * @throws UncheckedIOException if the bundled model cannot be loaded + */ + public static JunkDetector provider() { + try { + return loadFromClasspath(); + } catch (IOException e) { + throw new java.io.UncheckedIOException( + "Failed to load bundled JunkDetector model", e); + } + } + /** * Loads a model from the given file path. The file may be gzipped or raw. */ diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java new file mode 100644 index 0000000000..dcbe1bbe4b --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.detect.MetaEncodingDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.quality.TextQualityComparison; +import org.apache.tika.quality.TextQualityDetector; + +/** + * A {@link MetaEncodingDetector} that arbitrates charset candidates by + * asking a {@link TextQualityDetector} which decoded candidate looks + * most like natural text. + * + * <p>Each base {@link org.apache.tika.detect.EncodingDetector} in the + * {@link org.apache.tika.detect.CompositeEncodingDetector} chain emits + * candidates into the {@link EncodingDetectorContext}. This meta detector + * then reads the raw probe bytes, decodes them under each unique candidate + * charset, and runs pairwise comparisons via + * {@link TextQualityDetector#compare} to pick the candidate whose decoding + * produces the cleanest text. BOM-declared, meta-tag-declared, + * structural, and statistical candidates all compete on the same footing — + * quality of the resulting decode is the sole criterion at this layer. + * + * <p>The {@link TextQualityDetector} implementation is discovered via + * {@link ServiceLoader}. When no implementation is on the classpath, + * this detector becomes a no-op: {@link #detect} returns an empty list + * and {@link org.apache.tika.detect.CompositeEncodingDetector} falls + * back to its default confidence-based ordering. + * + * @since Apache Tika 4.0.0 (TIKA-4720) + */ +@TikaComponent(name = "junk-filter-encoding-detector") +public class JunkFilterEncodingDetector implements MetaEncodingDetector { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = + LoggerFactory.getLogger(JunkFilterEncodingDetector.class); + + /** How many probe bytes to read for decoding candidates. Matches the + * default read limit used by the charset base detectors. */ + private static final int DEFAULT_READ_LIMIT = 16384; + + /** Cached quality detector. {@code null} if none is on the classpath. */ + private final TextQualityDetector qualityDetector; + + private int readLimit = DEFAULT_READ_LIMIT; + + public JunkFilterEncodingDetector() { + // The junk detector is hardcoded rather than ServiceLoader-discovered + // so construction cannot silently fail to register a quality detector + // and leave this meta detector as a no-op. JunkDetector lives in the + // same module and loads its bundled model from the classpath. + TextQualityDetector q = null; + try { + q = JunkDetector.loadFromClasspath(); + LOG.debug("Loaded JunkDetector: {}", q.getClass().getName()); + } catch (Throwable t) { + // A broken model binary (e.g. block-table dimension mismatch + // across JVM Unicode versions) would otherwise propagate and + // prevent this meta detector from registering at all. Fail safe: + // log and operate as a no-op. + LOG.warn("Failed to load JunkDetector; JunkFilterEncodingDetector " + + "will operate as a no-op: {}", t.toString()); + } + this.qualityDetector = q; + } + + /** Test-only / deterministic-wiring constructor. */ + public JunkFilterEncodingDetector(TextQualityDetector qualityDetector) { + this.qualityDetector = qualityDetector; + } + + public int getReadLimit() { + return readLimit; + } + + public void setReadLimit(int readLimit) { + this.readLimit = readLimit; + } + + @Override + public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, + ParseContext parseContext) throws IOException { + if (qualityDetector == null) { + return Collections.emptyList(); + } + if (parseContext == null) { + return Collections.emptyList(); + } + EncodingDetectorContext context = parseContext.get(EncodingDetectorContext.class); + if (context == null || context.getResults().isEmpty()) { + return Collections.emptyList(); + } + + Set<Charset> uniqueCharsets = context.getUniqueCharsets(); + if (uniqueCharsets.size() <= 1) { + // Nothing to arbitrate. Let the composite's default ordering + // pick the single candidate. + return Collections.emptyList(); + } + + if (tis == null) { + context.setArbitrationInfo("junk-filter-no-stream"); + return Collections.emptyList(); + } + + byte[] bytes = readProbe(tis); + if (bytes == null || bytes.length == 0) { + context.setArbitrationInfo("junk-filter-empty-stream"); + return Collections.emptyList(); + } + bytes = stripBomBytes(bytes); + + // Decode probe under each candidate, preserving insertion order so + // tournament seeding is deterministic. + Map<Charset, String> candidates = new LinkedHashMap<>(); + for (Charset cs : uniqueCharsets) { + String decoded = safeDecode(bytes, cs); + if (decoded != null && !decoded.isEmpty()) { + candidates.put(cs, decoded); + } + } + if (candidates.size() <= 1) { + // One or zero candidates produced usable text; nothing to compare. + return Collections.emptyList(); + } + if (allDecodingsIdentical(candidates)) { + // Byte-identical decodings (typical on pure-ASCII probes). + // Text quality cannot distinguish them. Prefer an author + // declaration (BOM / HTML meta charset / HTTP Content-Type) + // over statistical or structural candidates: if the document + // tells us what it is and the bytes are compatible with that + // claim, honour it. + Charset declared = pickDeclarative(context, candidates.keySet()); + if (declared != null) { + float conf = context.getTopConfidenceFor(declared); + context.setArbitrationInfo("junk-filter-prefer-declarative"); + return List.of(new EncodingResult(declared, conf)); + } + context.setArbitrationInfo("junk-filter-identical-decodings"); + return Collections.emptyList(); + } + + // Pairwise tournament: the first candidate seeds the champion slot; + // every subsequent candidate challenges the current champion. + Iterator<Map.Entry<Charset, String>> it = candidates.entrySet().iterator(); + Map.Entry<Charset, String> champion = it.next(); + while (it.hasNext()) { + Map.Entry<Charset, String> challenger = it.next(); + TextQualityComparison cmp = qualityDetector.compare( + champion.getKey().name(), champion.getValue(), + challenger.getKey().name(), challenger.getValue()); + if ("B".equals(cmp.winner())) { + champion = challenger; + } + } + + float confidence = context.getTopConfidenceFor(champion.getKey()); + context.setArbitrationInfo("junk-filter-selected"); + return List.of(new EncodingResult(champion.getKey(), confidence)); + } + + /** + * Return the first DECLARATIVE charset in {@code context} whose charset + * is also in {@code eligible}, or {@code null} if no declarative result + * matches an eligible candidate. "Eligible" = present in the candidates + * we actually decoded (i.e. excludes candidates that failed to decode). + */ + private static Charset pickDeclarative(EncodingDetectorContext context, + Set<Charset> eligible) { + for (EncodingDetectorContext.Result r : context.getResults()) { + for (EncodingResult er : r.getEncodingResults()) { + if (er.getResultType() == EncodingResult.ResultType.DECLARATIVE + && eligible.contains(er.getCharset())) { + return er.getCharset(); + } + } + } + return null; + } + + private static boolean allDecodingsIdentical(Map<Charset, String> candidates) { + String first = null; + for (String s : candidates.values()) { + if (first == null) { + first = s; + } else if (!first.equals(s)) { + return false; + } + } + return true; + } + + private byte[] readProbe(TikaInputStream tis) throws IOException { + try { + tis.mark(readLimit); + byte[] buf = new byte[readLimit]; + int total = 0; + int read; + while (total < readLimit + && (read = tis.read(buf, total, readLimit - total)) != -1) { + total += read; + } + if (total == 0) { + return null; + } + if (total < readLimit) { + byte[] trimmed = new byte[total]; + System.arraycopy(buf, 0, trimmed, 0, total); + return trimmed; + } + return buf; + } finally { + tis.reset(); + } + } + + private static String safeDecode(byte[] bytes, Charset charset) { + try { + return new String(bytes, charset); + } catch (Exception e) { + LOG.debug("Decode failed for {}: {}", charset.name(), e.toString()); + return null; + } + } + + /** + * Strip a leading byte-order mark, if any. UTF-32 signatures are + * checked before UTF-16 because the UTF-32 LE BOM ({@code FF FE 00 00}) + * starts with the UTF-16 LE BOM ({@code FF FE}). + */ + private static byte[] stripBomBytes(byte[] bytes) { + int bomLen = bomLength(bytes); + if (bomLen == 0) { + return bytes; + } + return Arrays.copyOfRange(bytes, bomLen, bytes.length); + } + + private static int bomLength(byte[] b) { + if (b.length >= 4 + && (b[0] & 0xFF) == 0x00 && (b[1] & 0xFF) == 0x00 + && (b[2] & 0xFF) == 0xFE && (b[3] & 0xFF) == 0xFF) { + return 4; // UTF-32BE + } + if (b.length >= 4 + && (b[0] & 0xFF) == 0xFF && (b[1] & 0xFF) == 0xFE + && (b[2] & 0xFF) == 0x00 && (b[3] & 0xFF) == 0x00) { + return 4; // UTF-32LE + } + if (b.length >= 3 + && (b[0] & 0xFF) == 0xEF && (b[1] & 0xFF) == 0xBB + && (b[2] & 0xFF) == 0xBF) { + return 3; // UTF-8 + } + if (b.length >= 2 + && (b[0] & 0xFF) == 0xFE && (b[1] & 0xFF) == 0xFF) { + return 2; // UTF-16BE + } + if (b.length >= 2 + && (b[0] & 0xFF) == 0xFF && (b[1] & 0xFF) == 0xFE) { + return 2; // UTF-16LE + } + return 0; + } + +} diff --git a/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java new file mode 100644 index 0000000000..0b97a9a0bd --- /dev/null +++ b/tika-ml/tika-ml-junkdetect/src/test/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetectorTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.junkdetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingDetectorContext; +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.quality.TextQualityComparison; +import org.apache.tika.quality.TextQualityDetector; +import org.apache.tika.quality.TextQualityScore; + +/** + * Unit tests for {@link JunkFilterEncodingDetector}. + * + * <p>Uses a stub {@link TextQualityDetector} rather than the real + * {@link JunkDetector} — we are testing arbitration control flow, not + * the quality of the junk detector's decisions. + */ +public class JunkFilterEncodingDetectorTest { + + /** Stub quality detector: always picks the label matching {@link #preferred}. */ + private static final class PreferenceStub implements TextQualityDetector { + private final String preferred; + + PreferenceStub(String preferred) { + this.preferred = preferred; + } + + @Override + public TextQualityScore score(String text) { + return new TextQualityScore(Float.NaN, Float.NaN, Float.NaN, + Float.NaN, "UNKNOWN"); + } + + @Override + public TextQualityComparison compare(String labelA, String candidateA, + String labelB, String candidateB) { + String winner = preferred.equals(labelA) ? "A" + : preferred.equals(labelB) ? "B" : "A"; + return new TextQualityComparison(winner, 0.0f, + score(candidateA), score(candidateB), labelA, labelB); + } + } + + private static ParseContext contextWith(EncodingResult... results) { + EncodingDetectorContext ctx = new EncodingDetectorContext(); + ctx.addResult(List.of(results), "stub"); + ParseContext p = new ParseContext(); + p.set(EncodingDetectorContext.class, ctx); + return p; + } + + @Test + public void picksPreferredCharsetFromTwoCandidates() throws Exception { + Charset utf8 = StandardCharsets.UTF_8; + Charset win1252 = Charset.forName("windows-1252"); + // Non-ASCII bytes so UTF-8 and windows-1252 decode to different strings + // (otherwise arbiter sees identical decodings and abstains). + byte[] bytes = "café résumé naïve".getBytes(StandardCharsets.UTF_8); + + ParseContext pc = contextWith( + new EncodingResult(utf8, 0.5f, "UTF-8", + EncodingResult.ResultType.STATISTICAL), + new EncodingResult(win1252, 0.5f, "windows-1252", + EncodingResult.ResultType.STATISTICAL)); + + JunkFilterEncodingDetector detector = + new JunkFilterEncodingDetector(new PreferenceStub("UTF-8")); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> out = detector.detect(tis, new Metadata(), pc); + assertEquals(1, out.size(), "Expected exactly one result"); + assertEquals(utf8, out.get(0).getCharset()); + } + } + + @Test + public void noopWhenNoQualityDetector() throws Exception { + byte[] bytes = "hello".getBytes(StandardCharsets.UTF_8); + ParseContext pc = contextWith( + new EncodingResult(StandardCharsets.UTF_8, 0.5f, "UTF-8", + EncodingResult.ResultType.STATISTICAL), + new EncodingResult(Charset.forName("windows-1252"), 0.5f, + "windows-1252", EncodingResult.ResultType.STATISTICAL)); + + JunkFilterEncodingDetector detector = + new JunkFilterEncodingDetector((TextQualityDetector) null); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> out = detector.detect(tis, new Metadata(), pc); + assertTrue(out.isEmpty(), + "No TextQualityDetector → detector must be a no-op"); + } + } + + @Test + public void noopWhenOnlyOneCandidate() throws Exception { + byte[] bytes = "hello".getBytes(StandardCharsets.UTF_8); + ParseContext pc = contextWith( + new EncodingResult(StandardCharsets.UTF_8, 0.9f, "UTF-8", + EncodingResult.ResultType.DECLARATIVE)); + + JunkFilterEncodingDetector detector = + new JunkFilterEncodingDetector(new PreferenceStub("UTF-8")); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> out = detector.detect(tis, new Metadata(), pc); + assertTrue(out.isEmpty(), + "Single candidate → no arbitration needed, no-op"); + } + } + + @Test + public void noopWhenAllDecodingsIdentical() throws Exception { + // Pure-ASCII bytes decode identically under UTF-8 and windows-1252. + byte[] bytes = "plain ascii content".getBytes(StandardCharsets.US_ASCII); + + ParseContext pc = contextWith( + new EncodingResult(StandardCharsets.UTF_8, 0.5f, "UTF-8", + EncodingResult.ResultType.STATISTICAL), + new EncodingResult(Charset.forName("windows-1252"), 0.5f, + "windows-1252", EncodingResult.ResultType.STATISTICAL)); + + JunkFilterEncodingDetector detector = + new JunkFilterEncodingDetector(new PreferenceStub("UTF-8")); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> out = detector.detect(tis, new Metadata(), pc); + assertTrue(out.isEmpty(), + "Byte-identical decodings → arbiter abstains"); + assertEquals("junk-filter-identical-decodings", + pc.get(EncodingDetectorContext.class).getArbitrationInfo()); + } + } + + // NOTE: a full default-constructor integration test (which would load + // the bundled JunkDetector via ServiceLoader) is not included here + // because JunkDetector currently exposes only static factory methods + // (loadFromClasspath / loadFromPath / load) and has no public no-arg + // constructor — ServiceLoader cannot instantiate it. Wiring JunkDetector + // up as a proper SPI provider is tracked as follow-up work for TIKA-4720; + // at that point this test can be added to exercise the real SPI path. +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml index f2dfdbb122..1bee0907b0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/pom.xml @@ -95,6 +95,12 @@ <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-ml-junkdetect</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-serialization</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index a7910c9f77..931b0df0c6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -28,7 +28,6 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; @@ -38,6 +37,7 @@ import org.apache.tika.detect.BOMDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.EncodingResult; +import org.apache.tika.detect.MetaEncodingDetector; import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.detect.OverrideEncodingDetector; import org.apache.tika.exception.TikaConfigException; @@ -61,10 +61,12 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector); no MetaEncodingDetector in default chain - assertEquals(4, detectors.size()); + // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector) + JunkFilter (MetaEncodingDetector) + assertEquals(5, detectors.size()); + // meta detector is always last (partitioned by CompositeEncodingDetector) + assertTrue(detectors.get(4) instanceof MetaEncodingDetector); // base detectors — sorted by full class name; check by type - Set<Class<?>> baseClasses = detectors.stream() + Set<Class<?>> baseClasses = detectors.subList(0, 4).stream() .map(Object::getClass).collect(Collectors.toSet()); assertTrue(baseClasses.contains(BOMDetector.class)); assertTrue(baseClasses.contains(MetadataCharsetDetector.class)); @@ -85,13 +87,14 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector1 instanceof CompositeEncodingDetector); List<EncodingDetector> detectors1Children = ((CompositeEncodingDetector) detector1).getDetectors(); - // BOM + Metadata + ML base detectors (html excluded, no meta) - assertEquals(3, detectors1Children.size()); - Set<Class<?>> innerClasses = detectors1Children.stream() + // BOM + Metadata + ML base detectors + JunkFilter meta (html excluded) + assertEquals(4, detectors1Children.size()); + Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream() .map(Object::getClass).collect(Collectors.toSet()); assertTrue(innerClasses.contains(BOMDetector.class)); assertTrue(innerClasses.contains(MetadataCharsetDetector.class)); assertTrue(innerClasses.contains(MojibusterEncodingDetector.class)); + assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector); assertTrue(detectors.get(1) instanceof OverrideEncodingDetector); @@ -183,9 +186,9 @@ public class TikaEncodingDetectorTest extends TikaTest { ((AbstractEncodingDetectorParser) encodingDetectingParser) .getEncodingDetector(); assertTrue(encodingDetector instanceof CompositeEncodingDetector); - // BOM, Metadata, ML, Html base detectors - // (ICU4J is excluded but was already not in the default chain; no meta) - assertEquals(4, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); + // BOM, Metadata, ML, Html base detectors + JunkFilter meta + // (ICU4J is excluded but was already not in the default chain) + assertEquals(5, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector) .getDetectors()) { assertNotContained("cu4j", child.getClass().getCanonicalName()); @@ -212,8 +215,8 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(encodingDetector instanceof CompositeEncodingDetector); List<EncodingDetector> children = ((CompositeEncodingDetector) encodingDetector).getDetectors(); - // 3 base detectors, no meta - assertEquals(3, children.size(), childParser.getClass().toString()); + // 3 base detectors + 1 MetaEncodingDetector (JunkFilter) + assertEquals(4, children.size(), childParser.getClass().toString()); assertTrue(children.get(0) instanceof MojibusterEncodingDetector, childParser.getClass().toString()); HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) children.get(1); @@ -221,12 +224,11 @@ public class TikaEncodingDetectorTest extends TikaTest { childParser.getClass().toString()); assertTrue(children.get(2) instanceof StandardHtmlEncodingDetector, childParser.getClass().toString()); + assertTrue(children.get(3) instanceof MetaEncodingDetector, + childParser.getClass().toString()); } } - @Disabled("TIKA-4720: needs JunkFilterEncodingDetector meta-arbiter so the " - + "mark-limit-raised HTML detector's DECLARATIVE UTF-8 beats NB's " - + "STATISTICAL windows-1252") @Test public void testMarkLimitIntegration() throws Exception { StringBuilder sb = new StringBuilder(); @@ -279,14 +281,10 @@ public class TikaEncodingDetectorTest extends TikaTest { /** * ASCII HTML with an explicit {@code <meta charset="UTF-8">} must be * detected as UTF-8. The HTML detector produces a DECLARATIVE UTF-8 - * result which outranks the statistical windows-1252 fallback. - * - * <p>Disabled pending reinstatement of a {@code MetaEncodingDetector} - * (TIKA-4720): without one in the chain, Mojibuster's STATISTICAL - * windows-1252 beats the HTML detector's DECLARATIVE UTF-8. Re-enable - * once {@code JunkFilterEncodingDetector} lands. + * result; {@code JunkFilterEncodingDetector} arbitrates the tie in its + * favour (pure-ASCII bytes decode identically as UTF-8 and windows-1252, + * so the DECLARATIVE hint wins). */ - @Disabled("TIKA-4720: needs JunkFilterEncodingDetector meta-arbiter") @Test public void testAsciiHtmlWithMetaIsDetectedAsUtf8() throws Exception { byte[] bytes = diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json index 7331a02325..d4f2483cad 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json @@ -12,6 +12,9 @@ "standard-html-encoding-detector": { "markLimit": 100000 } + }, + { + "junk-filter-encoding-detector": {} } ] }
