This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4720-wiring in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4644cbeb17c63d5eb0c4813a2cbbec6d545461ee Author: tallison <[email protected]> AuthorDate: Fri Apr 24 08:18:11 2026 -0400 TIKA-4720: restore MojibusterEncodingDetector name --- .../ROOT/pages/advanced/charset-detection-design.adoc | 6 +++--- .../modules/ROOT/pages/configuration/encoding-detectors.adoc | 8 ++++---- .../src/main/java/org/apache/tika/detect/BOMDetector.java | 2 +- .../services/org.apache.tika.detect.EncodingDetector | 2 +- .../tika-encoding-detector-mojibuster/pom.xml | 2 +- .../org/apache/tika/ml/chardetect/CharsetConfusables.java | 2 +- ...EncodingDetector.java => MojibusterEncodingDetector.java} | 10 +++++----- .../tika/ml/chardetect/Utf16SpecialistEncodingDetector.java | 6 +++--- .../org/apache/tika/ml/chardetect/WideUnicodeDetector.java | 4 ++-- .../services/org.apache.tika.detect.EncodingDetector | 2 +- .../tika/ml/chardetect/SparseLatinVcardRegressionTest.java | 2 +- .../apache/tika/ml/chardetect/ZipFilenameDetectionTest.java | 2 +- .../tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java | 6 +++--- .../tika/ml/chardetect/tools/BuildCharsetTrainingData.java | 2 +- .../tika/ml/chardetect/tools/EvalCharsetDetectors.java | 4 ++-- .../org/apache/tika/config/TikaEncodingDetectorTest.java | 12 ++++++------ .../configs/TIKA-2485-encoding-detector-mark-limits.json | 2 +- .../java/org/apache/tika/parser/html/HtmlParserTest.java | 2 +- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc index a3d684295a..b2bd20791a 100644 --- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc +++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc @@ -51,7 +51,7 @@ results are collected into an `EncodingDetectorContext` on the US-ASCII → windows-1252). DECLARATIVE. | 3 -| `NaiveBayesPipelineEncodingDetector` +| `MojibusterEncodingDetector` | `tika-encoding-detector-mojibuster` | Structural UTF-32 and UTF-16 detection, UTF-8 grammar gate, HTML attribute-aware stripping, then a 33-class byte-bigram NB @@ -90,7 +90,7 @@ See <<opting-out-of-arbitration>> for HTML5-spec-compliant behavior that honours declarations without second-guessing. [[nb-pipeline]] -== NaiveBayesPipelineEncodingDetector +== MojibusterEncodingDetector The pipeline runs layers in order and returns the first confident structural answer when one fires; otherwise it falls through to NB. @@ -286,7 +286,7 @@ then runs in first-match-wins mode: { "bom-detector": {} }, { "metadata-charset-detector": {} }, { "standard-html-encoding-detector": {} }, - { "naive-bayes-pipeline-encoding-detector": {} } + { "mojibuster-encoding-detector": {} } ] } ---- diff --git a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc index 8ff335734e..16684ea172 100644 --- a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc +++ b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc @@ -42,7 +42,7 @@ Emits DECLARATIVE. (populated from an HTTP response header or similar). Emits DECLARATIVE. |3 -|`naive-bayes-pipeline-encoding-detector` +|`mojibuster-encoding-detector` |A structural UTF-32 check, structural UTF-16 specialist, UTF-8 grammar gate, and 33-class byte-bigram Naive Bayes classifier. STRUCTURAL for structural hits; STATISTICAL for NB predictions. @@ -87,7 +87,7 @@ referenced by their SPI name in JSON configuration. the `Metadata` object. Applies WHATWG label normalization (ISO-8859-1 and US-ASCII → windows-1252). In the default chain. -|`naive-bayes-pipeline-encoding-detector` +|`mojibuster-encoding-detector` |`tika-encoding-detector-mojibuster` |Byte-bigram Naive Bayes classifier plus structural detectors for UTF-32 and UTF-16 and a UTF-8 grammar gate. 33 classes including CJK @@ -159,7 +159,7 @@ behaviour), omit CharSoup: {"bom-detector": {}}, {"metadata-charset-detector": {}}, {"html-encoding-detector": {}}, - {"naive-bayes-pipeline-encoding-detector": {}} + {"mojibuster-encoding-detector": {}} ] } ---- @@ -187,7 +187,7 @@ large `<script>` blocks before the meta tag (TIKA-2485): "markLimit": 131072 } }, - {"naive-bayes-pipeline-encoding-detector": {}}, + {"mojibuster-encoding-detector": {}}, {"charsoup-encoding-detector": {}} ] } diff --git a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java index 1f7cf38ba8..21e9ca08a7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java @@ -38,7 +38,7 @@ import org.apache.tika.parser.ParseContext; * * <p>SPI-loaded first in the default encoding-detector chain so that BOM evidence * reaches {@code CharSoupEncodingDetector} before any statistical detector runs. - * {@code NaiveBayesPipelineEncodingDetector} strips the BOM from its own probe independently + * {@code MojibusterEncodingDetector} strips the BOM from its own probe independently * to ensure consistent model inference (BOMs are excluded from training data).</p> * * @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0) diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector index 2252b52326..95478a66ba 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -17,6 +17,6 @@ # It must be loaded last; DefaultEncodingDetector's class-name sort puts # org.apache.tika.parser.* after org.apache.tika.ml.* after # org.apache.tika.detect.*, which guarantees the correct order: -# NaiveBayesPipelineEncodingDetector → +# MojibusterEncodingDetector → # StandardHtmlEncodingDetector → CharSoupEncodingDetector org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml b/tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml index f57b9d1e9b..059be67f2d 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml @@ -28,7 +28,7 @@ <artifactId>tika-encoding-detector-mojibuster</artifactId> <name>Apache Tika ML charset encoding detector</name> <description> - ML-based charset encoding detector. NaiveBayesPipelineEncodingDetector + ML-based charset encoding detector. MojibusterEncodingDetector runs structural detectors for UTF-32 / UTF-16 / UTF-8 and then a 33-class byte-bigram Naive Bayes classifier covering CJK multi-byte, EBCDIC, DOS OEM, Cyrillic, Windows single-byte, ISO-8859, and Mac diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java index 5b8f87d622..12d9914292 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java @@ -145,7 +145,7 @@ public final class CharsetConfusables { * in positions the family agrees on — e.g. 0xE4='ä' in every member). * * <p>Used by the Latin-windows-1252 fallback rule in - * {@code NaiveBayesPipelineEncodingDetector}: if the top candidate is a member + * {@code MojibusterEncodingDetector}: if the top candidate is a member * of this set AND the probe decodes byte-identically under windows-1252, * swap to windows-1252 as the unmarked Latin default. This is a * narrower replacement for an earlier general "decode-equivalence diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesPipelineEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java similarity index 98% rename from tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesPipelineEncodingDetector.java rename to tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 64c859525a..d1746b3781 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesPipelineEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -64,8 +64,8 @@ import org.apache.tika.parser.ParseContext; * candidate. Conservative: only return at a layer when that layer's * structural check is clean.</p> */ -@TikaComponent(name = "naive-bayes-pipeline-encoding-detector") -public class NaiveBayesPipelineEncodingDetector implements EncodingDetector { +@TikaComponent(name = "mojibuster-encoding-detector") +public class MojibusterEncodingDetector implements EncodingDetector { /** Default NB bigram model on the classpath. */ public static final String DEFAULT_MODEL_RESOURCE = @@ -132,18 +132,18 @@ public class NaiveBayesPipelineEncodingDetector implements EncodingDetector { * classpath at {@link #DEFAULT_MODEL_RESOURCE}. The UTF-16 * specialist loads its own model the same way. */ - public NaiveBayesPipelineEncodingDetector() throws IOException { + public MojibusterEncodingDetector() throws IOException { this.nb = loadFromClasspath(); this.utf16 = new Utf16SpecialistEncodingDetector(); } - public NaiveBayesPipelineEncodingDetector(Path nbModelPath) throws IOException { + public MojibusterEncodingDetector(Path nbModelPath) throws IOException { this.nb = new NaiveBayesBigramEncodingDetector(nbModelPath); this.utf16 = new Utf16SpecialistEncodingDetector(); } private static NaiveBayesBigramEncodingDetector loadFromClasspath() throws IOException { - InputStream in = NaiveBayesPipelineEncodingDetector.class + InputStream in = MojibusterEncodingDetector.class .getResourceAsStream(DEFAULT_MODEL_RESOURCE); if (in == null) { throw new IOException( diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/Utf16SpecialistEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/Utf16SpecialistEncodingDetector.java index d329e7581c..41cf643659 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/Utf16SpecialistEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/Utf16SpecialistEncodingDetector.java @@ -53,7 +53,7 @@ import org.apache.tika.parser.ParseContext; * * <h3>Stage 1 of the MoE migration</h3> * - * <p>Runs alongside the existing {@code NaiveBayesPipelineEncodingDetector} + * <p>Runs alongside the existing {@code MojibusterEncodingDetector} * rather than replacing any piece of it. Emits a single * {@link EncodingResult.ResultType#STATISTICAL} candidate for CharSoup to * arbitrate against the other detectors in the chain. The existing @@ -258,7 +258,7 @@ public class Utf16SpecialistEncodingDetector /** * Byte-array entry point for callers that already hold a probe - * (e.g. {@code NaiveBayesPipelineEncodingDetector}'s pipeline). Returns an + * (e.g. {@code MojibusterEncodingDetector}'s pipeline). Returns an * empty list for probes below {@link #MIN_PROBE_BYTES} or when the * winning class has margin < {@link #MIN_LOGIT_MARGIN}. */ @@ -315,7 +315,7 @@ public class Utf16SpecialistEncodingDetector /** * Map training-label charset names (e.g. {@code "UTF-16-LE"} with * hyphens) to Java's canonical charset names ({@code "UTF-16LE"} no - * hyphen). Mirrors the mapping in {@code NaiveBayesPipelineEncodingDetector}. + * hyphen). Mirrors the mapping in {@code MojibusterEncodingDetector}. */ private static String toJavaCharsetName(String label) { switch (label) { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/WideUnicodeDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/WideUnicodeDetector.java index 08f9b561c4..25a9c66d12 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/WideUnicodeDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/WideUnicodeDetector.java @@ -20,7 +20,7 @@ import java.nio.charset.Charset; /** * Structural analysis for UTF-32 LE/BE, plus UTF-16 surrogate validity - * flags. This is an internal component of {@code NaiveBayesPipelineEncodingDetector}'s + * flags. This is an internal component of {@code MojibusterEncodingDetector}'s * pipeline — not a standalone {@code EncodingDetector}. Requires upstream * BOM stripping. * @@ -178,7 +178,7 @@ final class WideUnicodeDetector { * {@code offset}. Does not attempt UTF-16 positive detection — that is * the job of {@link Utf16SpecialistEncodingDetector}. Returns only * surrogate-invalidity flags under each endianness, used by - * {@code NaiveBayesPipelineEncodingDetector} to suppress UTF-16 labels from + * {@code MojibusterEncodingDetector} to suppress UTF-16 labels from * the main statistical model on probes that cannot be valid UTF-16. */ private static Result tryUtf16(byte[] bytes, int offset, int length) { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector index c9dd3f2a71..dabb7ab55b 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.tika.ml.chardetect.NaiveBayesPipelineEncodingDetector +org.apache.tika.ml.chardetect.MojibusterEncodingDetector diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java index c482d5ec78..b49dbc1655 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java @@ -39,7 +39,7 @@ import org.apache.tika.parser.ParseContext; * business name) could detect as {@code IBM424} (Hebrew EBCDIC) at * 0.99 confidence — producing complete mojibake. The combination of * structural IBM424 gating, Latin-sibling → windows-1252 fallback in - * {@code NaiveBayesPipelineEncodingDetector}, and CharSoup's + * {@code MojibusterEncodingDetector}, and CharSoup's * language-signal arbitration prevents that. This test exercises * the full detector chain via {@link DefaultEncodingDetector} and * asserts the non-catastrophic property: not IBM424.</p> diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java index bd9701b805..4797f9983c 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java @@ -71,7 +71,7 @@ public class ZipFilenameDetectionTest { ctx.addResult(List.of( new EncodingResult(big5, 0.9f, "Big5-HKSCS", EncodingResult.ResultType.STATISTICAL), new EncodingResult(shiftJis, 0.3f, "Shift_JIS", EncodingResult.ResultType.STATISTICAL) - ), "NaiveBayesPipelineEncodingDetector"); + ), "MojibusterEncodingDetector"); ParseContext parseContext = new ParseContext(); parseContext.set(EncodingDetectorContext.class, ctx); diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java index 98110d849c..6f604d8867 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java @@ -31,7 +31,7 @@ import java.util.zip.GZIPInputStream; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.ml.chardetect.NaiveBayesPipelineEncodingDetector; +import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.parser.txt.UniversalEncodingDetector; @@ -88,11 +88,11 @@ public class BenchmarkCharsetDetectors { samples.size(), totalBytes, (double) totalBytes / samples.size()); EncodingDetector[] detectors = { - new NaiveBayesPipelineEncodingDetector(), + new MojibusterEncodingDetector(), new Icu4jEncodingDetector(), new UniversalEncodingDetector(), }; - String[] names = {"NaiveBayesPipelineEncodingDetector", "Icu4jEncodingDetector", "UniversalEncodingDetector"}; + String[] names = {"MojibusterEncodingDetector", "Icu4jEncodingDetector", "UniversalEncodingDetector"}; System.out.printf(Locale.ROOT, "%-28s %8s %10s %10s%n", "Detector", "Rounds", "MB/s", "µs/call"); diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java index e273a2ee8a..bf3efa9fe6 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java @@ -282,7 +282,7 @@ public class BuildCharsetTrainingData { * Charsets whose encoded bytes are all {@literal <} 0x80, so the ML model * would see zero features. Only devtest/test files are generated; train * is skipped. These charsets are detected by structural gates in - * {@code NaiveBayesPipelineEncodingDetector} before the model is ever called. + * {@code MojibusterEncodingDetector} before the model is ever called. */ private static final Set<String> STRUCTURAL_ONLY = new HashSet<>(Arrays.asList( "US-ASCII", "ISO-2022-JP", "ISO-2022-KR", "ISO-2022-CN", "x-ISO-2022-CN-CNS" diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java index a124bbcdb2..1c089f23ea 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java @@ -45,7 +45,7 @@ import org.apache.tika.parser.txt.Icu4jEncodingDetector; import org.apache.tika.parser.txt.UniversalEncodingDetector; /** - * Compares {@code NaiveBayesPipelineEncodingDetector} against ICU4J and juniversalchardet. + * Compares {@code MojibusterEncodingDetector} against ICU4J and juniversalchardet. * * <p>Supports: * <ul> @@ -149,7 +149,7 @@ public class EvalCharsetDetectors { // gate + NB bigram classifier. This is the ship candidate; // compared here against ICU4J and juniversalchardet. EncodingDetector nbDetector = nbModelPath != null - ? new org.apache.tika.ml.chardetect.NaiveBayesPipelineEncodingDetector(nbModelPath) + ? new org.apache.tika.ml.chardetect.MojibusterEncodingDetector(nbModelPath) : new NoopDetector(); EncodingDetector[] detectors = { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index c0088b2a39..4234918e22 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -45,7 +45,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.ml.chardetect.NaiveBayesPipelineEncodingDetector; +import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; @@ -71,7 +71,7 @@ public class TikaEncodingDetectorTest extends TikaTest { .map(Object::getClass).collect(Collectors.toSet()); assertTrue(baseClasses.contains(BOMDetector.class)); assertTrue(baseClasses.contains(MetadataCharsetDetector.class)); - assertTrue(baseClasses.contains(NaiveBayesPipelineEncodingDetector.class)); + assertTrue(baseClasses.contains(MojibusterEncodingDetector.class)); assertTrue(baseClasses.contains(HtmlEncodingDetector.class)); } @@ -95,7 +95,7 @@ public class TikaEncodingDetectorTest extends TikaTest { .map(Object::getClass).collect(Collectors.toSet()); assertTrue(innerClasses.contains(BOMDetector.class)); assertTrue(innerClasses.contains(MetadataCharsetDetector.class)); - assertTrue(innerClasses.contains(NaiveBayesPipelineEncodingDetector.class)); + assertTrue(innerClasses.contains(MojibusterEncodingDetector.class)); assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector); assertTrue(detectors.get(1) instanceof OverrideEncodingDetector); @@ -124,7 +124,7 @@ public class TikaEncodingDetectorTest extends TikaTest { @Test public void testEncodingDetectorConfigurability() throws Exception { - // CP500 (EBCDIC) is now detected by NaiveBayesPipelineEncodingDetector's structural IBM500 rule. + // CP500 (EBCDIC) is now detected by MojibusterEncodingDetector's structural IBM500 rule. // We must hint Content-Type=text/plain so that TXTParser is selected; without the filename // extension the byte-level MIME sniffer classifies the EBCDIC data as octet-stream. Metadata md = new Metadata(); @@ -219,7 +219,7 @@ public class TikaEncodingDetectorTest extends TikaTest { ((CompositeEncodingDetector) encodingDetector).getDetectors(); // 3 base detectors + 1 MetaEncodingDetector (CharSoup) = 4 total assertEquals(4, children.size(), childParser.getClass().toString()); - assertTrue(children.get(0) instanceof NaiveBayesPipelineEncodingDetector, + assertTrue(children.get(0) instanceof MojibusterEncodingDetector, childParser.getClass().toString()); HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) children.get(1); assertEquals(100000, htmlDet.getDefaultConfig().getMarkLimit(), @@ -290,7 +290,7 @@ public class TikaEncodingDetectorTest extends TikaTest { .map(Object::getClass).collect(Collectors.toSet()); assertTrue(excludedCharSoupClasses.contains(BOMDetector.class)); assertTrue(excludedCharSoupClasses.contains(MetadataCharsetDetector.class)); - assertTrue(excludedCharSoupClasses.contains(NaiveBayesPipelineEncodingDetector.class)); + assertTrue(excludedCharSoupClasses.contains(MojibusterEncodingDetector.class)); assertTrue(excludedCharSoupClasses.contains(HtmlEncodingDetector.class)); for (EncodingDetector d : detectors) { assertNotContained("CharSoup", d.getClass().getSimpleName()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json index a319f8668f..8275da4bfc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json @@ -1,7 +1,7 @@ { "encoding-detectors": [ { - "naive-bayes-pipeline-encoding-detector": {} + "mojibuster-encoding-detector": {} }, { "html-encoding-detector": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 926d28bada..4b34559d18 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -294,7 +294,7 @@ public class HtmlParserTest extends TikaTest { * {@code <html><head><title>\u017d</title></head><body></body></html>} * — 56 bytes of markup with exactly 2 non-ASCII bytes ({@code c5 bd} * = {@code Ž} in UTF-8). After HTML stripping the probe reduces to - * 2 bytes (the title body). {@code NaiveBayesPipelineEncodingDetector} + * 2 bytes (the title body). {@code MojibusterEncodingDetector} * correctly returns UTF-8 as a {@code STRUCTURAL} candidate via the * UTF-8 grammar gate. However {@code CharSoupEncodingDetector} then * arbitrates across all base-detector candidates by language-signal
