This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch refactor-html-encoding-detectors in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9f3e79b770859de51a87bc22ecfd3c10cca328c2 Author: tallison <[email protected]> AuthorDate: Thu Apr 23 20:51:01 2026 -0400 improve legacy charset detector to benefit from features of StandardHtmlEncodingDetector --- .../pages/advanced/charset-detection-design.adoc | 19 +-- .../pages/configuration/encoding-detectors.adoc | 20 +-- .../tika/parser/html/HtmlEncodingDetector.java | 8 +- .../tika/parser/html/TikaHtmlCharsetAliases.java | 172 +++++++++++++++++++++ .../parser/html/charsetdetector/PreScanner.java | 17 -- .../StandardHtmlEncodingDetector.java | 66 ++------ .../tika/config/TikaEncodingDetectorTest.java | 43 +----- ...IKA-2273-exclude-encoding-detector-default.json | 2 +- .../configs/tika-config-html-standalone-bom.json | 9 -- .../tika/parser/html/HtmlEncodingDetectorTest.java | 58 +++++++ .../html/StandardHtmlEncodingDetectorTest.java | 30 +--- 11 files changed, 274 insertions(+), 170 deletions(-) diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc index 64e2bd4c66..2df8226786 100644 --- a/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc +++ b/docs/modules/ROOT/pages/advanced/charset-detection-design.adoc @@ -60,12 +60,13 @@ every detector runs regardless of what the others returned, and the (≤ 50 bytes). | 4 -| `StandardHtmlEncodingDetector` +| `HtmlEncodingDetector` | `tika-encoding-detector-html` -| Scans HTML `<meta charset>` / `<meta http-equiv=Content-Type>` tags. - Returns a DECLARATIVE result. Skips BOM detection by default - (`skipBOM=true`) so that `BOMDetector` owns that signal; set `skipBOM=false` - for standalone use without `BOMDetector`. +| Scans HTML `<meta charset>` / `<meta http-equiv=Content-Type>` tags with a + fast lenient regex matcher. Returns a DECLARATIVE result. Applies a + curated subset of WHATWG label aliases (see <<html-charset-aliases>>). + An alternative, spec-strict implementation — `StandardHtmlEncodingDetector` + — is available opt-in for users who need the full WHATWG prescan algorithm. | 5 | `CharSoupEncodingDetector` @@ -503,10 +504,10 @@ Reads the first 4 bytes and detects: | `FE FF` | UTF-16-BE |=== -Returns a DECLARATIVE result. `StandardHtmlEncodingDetector` skips BOM -detection by default (`skipBOM=true`) so that `BOMDetector` is the sole source -of BOM evidence. This separation allows `CharSoupEncodingDetector` to -arbitrate when a BOM and a `<meta charset>` tag disagree. +Returns a DECLARATIVE result. The HTML detectors do not handle BOMs on their +own: `BOMDetector` is the sole source of BOM evidence, which lets +`CharSoupEncodingDetector` arbitrate when a BOM and a `<meta charset>` tag +disagree. == Performance and accuracy diff --git a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc index 25999b3acd..80105a77ed 100644 --- a/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc +++ b/docs/modules/ROOT/pages/configuration/encoding-detectors.adoc @@ -40,9 +40,9 @@ The default chain when `tika-charset-detectors-core` is on the classpath: |A UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE byte-order mark is present. |3 -|`standard-html-encoding-detector` +|`html-encoding-detector` |An HTML `<meta charset="…">` or `Content-Type` http-equiv tag is found -(WHATWG spec prescan algorithm). +(fast lenient regex matcher, curated WHATWG label aliases). |4 |`ml-encoding-detector` @@ -100,9 +100,10 @@ referenced by name in JSON configuration. |`tika-charset-detectors-core` |Byte-order mark detection (UTF-8/16/32). In the default chain. -|`standard-html-encoding-detector` +|`html-encoding-detector` |`tika-charset-detectors-core` -|WHATWG-spec HTML charset prescan. In the default chain. +|Fast lenient regex matcher for `<meta charset>` / `http-equiv` tags, with a +curated subset of WHATWG label aliases. In the default chain. |`ml-encoding-detector` |`tika-charset-detectors-core` @@ -114,10 +115,11 @@ In the default chain. |State-machine structural prober; wraps the `com.github.albfernandez:juniversalchardet` fork. Auto-registers when the module jar is on the classpath. -|`html-encoding-detector` +|`standard-html-encoding-detector` |`tika-charset-detectors-core` -|Older regex-based HTML meta-charset detector. Not in the default chain -(use `standard-html-encoding-detector` instead). +|Spec-strict WHATWG prescan algorithm. Not in the default chain — opt in +explicitly if you need strict WHATWG tokenisation (e.g. ignoring charset +declarations inside comments or other contexts the lenient regex may match). |`icu4j-encoding-detector` |`tika-charset-detectors-icu4j` @@ -159,7 +161,7 @@ statistical chain: "encoding-detectors": [ {"http-header-encoding-detector": {}}, {"bom-encoding-detector": {}}, - {"standard-html-encoding-detector": {}}, + {"html-encoding-detector": {}}, {"ml-encoding-detector": {}} ] } @@ -177,7 +179,7 @@ large `<script>` blocks before the `<meta charset>` declaration. {"http-header-encoding-detector": {}}, {"bom-encoding-detector": {}}, { - "standard-html-encoding-detector": { + "html-encoding-detector": { "markLimit": 65536 } }, diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index c2fe6dac76..c052b062b6 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -50,7 +50,7 @@ import org.apache.tika.utils.CharsetUtils; * * @since Apache Tika 1.2 */ -@TikaComponent(spi = false) +@TikaComponent(name = "html-encoding-detector") public class HtmlEncodingDetector implements EncodingDetector { // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) @@ -191,10 +191,10 @@ public class HtmlEncodingDetector implements EncodingDetector { if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { continue; } - if ("x-user-defined".equalsIgnoreCase(candCharset)) { - candCharset = "windows-1252"; + Charset aliased = TikaHtmlCharsetAliases.resolve(candCharset); + if (aliased != null) { + return aliased; } - if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/TikaHtmlCharsetAliases.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/TikaHtmlCharsetAliases.java new file mode 100644 index 0000000000..3bcfff140f --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/TikaHtmlCharsetAliases.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +/** + * Curated subset of the WHATWG Encoding Standard label table + * (https://encoding.spec.whatwg.org/) for use by {@link HtmlEncodingDetector}. + * + * <p>The WHATWG table is designed for <em>web browsers</em> where lenient + * decoding with fallbacks is preferable to failing or producing mojibake. + * For a generic content-extraction library the same policy can be + * data-destructive, so this class intentionally departs from the spec in + * three places: + * + * <ol> + * <li><strong>No replacement charset for ISO-2022-KR / ISO-2022-CN / + * HZ-GB-2312.</strong> WHATWG maps these to a dummy "replacement" + * decoder that emits {@code U+FFFD} for every byte. For security in a + * browser this is fine; for Tika it would throw away legitimate text + * in those encodings, so we simply leave the labels unmapped and let + * the downstream detector chain (Mojibuster's structural rules, etc.) + * handle them.</li> + * <li><strong>No ISO-8859-14 / 16 / 10 downgrades.</strong> WHATWG + * collapses these into ISO-8859-1 / ISO-8859-4 because no major + * browser implements them. Java's JDK does, so we let the label + * resolve to the native charset via {@link Charset#forName}.</li> + * <li><strong>{@code windows-949} / {@code MS949} / {@code CP949} → + * {@code x-windows-949} (not {@code EUC-KR}).</strong> Unified Hangul + * Code is a strict superset of EUC-KR — resolving these labels to + * EUC-KR emits {@code U+FFFD} on extension bytes that MS949 decodes + * correctly.</li> + * </ol> + * + * <p>All other WHATWG labels we recognise — including browser-friendly + * aliases like {@code iso-8859-1} → {@code windows-1252}, {@code iso-8859-9} + * → {@code windows-1254}, {@code tis-620} → {@code windows-874}, and the + * naked {@code utf-16} → {@code UTF-16LE} BOM-absent default — match the + * spec exactly. + */ +final class TikaHtmlCharsetAliases { + + private static final Map<String, Charset> CHARSETS_BY_LABEL = buildTable(); + + private TikaHtmlCharsetAliases() { + } + + /** + * @param label a charset label from an HTML {@code <meta charset>} or + * {@code Content-Type} attribute + * @return the Java charset this label resolves to, or {@code null} if the + * label is not in the curated alias table (callers should then + * fall back to {@link Charset#forName} with a supported-by-IANA + * check) + */ + static Charset resolve(String label) { + if (label == null) { + return null; + } + return CHARSETS_BY_LABEL.get(label.trim().toLowerCase(Locale.US)); + } + + private static Map<String, Charset> buildTable() { + Map<String, Charset> m = new HashMap<>(); + add(m, charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"); + add(m, charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"); + add(m, charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", + "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601"); + // windows-949 / MS949 / CP949 are supersets of EUC-KR; route to x-windows-949 + // to preserve MS949 extension syllables (see class javadoc). + add(m, charset("x-windows-949"), "windows-949", "ms949", "cp949"); + add(m, charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", + "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"); + add(m, charset("IBM866"), "866", "cp866", "csibm866", "ibm866"); + add(m, charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp"); + add(m, charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913"); + add(m, charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", + "iso_8859-15", "l9"); + add(m, charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", + "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"); + add(m, charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", + "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"); + add(m, charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", + "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"); + add(m, charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", + "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"); + add(m, charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i", + "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", + "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"); + add(m, charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", + "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", + "iso_8859-7:1987", "sun_eu_greek"); + // ISO-8859-8 (visual order) and ISO-8859-8-I (logical order): + // we do not implement directionality remapping, so both resolve to ISO-8859-8 + // where available. + add(m, charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", + "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", + "iso_8859-8:1988", "visual"); + add(m, charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical"); + add(m, charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"); + add(m, charset("KOI8-U"), "koi8-ru", "koi8-u"); + add(m, charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", + "shift_jis", "sjis", "windows-31j", "x-sjis"); + add(m, charset("UTF-16BE"), "utf-16be"); + // Naked "utf-16" with no BOM defaults to UTF-16LE per WHATWG. + add(m, charset("UTF-16LE"), "utf-16", "utf-16le"); + add(m, charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8"); + add(m, charset("gb18030"), "gb18030"); + add(m, charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250"); + add(m, charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251"); + add(m, charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", + "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", + "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", + "x-cp1252"); + add(m, charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253"); + add(m, charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", + "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", + "windows-1254", "x-cp1254"); + add(m, charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255"); + add(m, charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256"); + add(m, charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257"); + add(m, charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258"); + add(m, charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911", + "tis-620", "windows-874"); + add(m, charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian"); + add(m, charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman"); + // x-user-defined is a browser-only passthrough; resolve to windows-1252, + // which mirrors HtmlEncodingDetector's pre-existing behaviour. + add(m, charset("windows-1252"), "x-user-defined"); + return m; + } + + private static Charset charset(String... names) { + for (String name : names) { + try { + return Charset.forName(name); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + // try next alternative + } + } + return null; + } + + private static void add(Map<String, Charset> m, Charset cs, String... labels) { + if (cs == null) { + return; + } + for (String label : labels) { + m.put(label, cs); + } + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java index c1b69ab2b5..1091812da8 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java @@ -20,7 +20,6 @@ import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.util.AbstractMap; import java.util.BitSet; import java.util.Map; @@ -50,9 +49,6 @@ class PreScanner { private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH); private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?'); - private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}; - private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF}; - private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE}; private static final byte LOWER_A = (byte) 'a'; private static final byte LOWER_Z = (byte) 'z'; private static final byte UPPER_A = (byte) 'A'; @@ -97,19 +93,6 @@ class PreScanner { return null; } - Charset detectBOM() { - try { - if (expect(UTF8_BOM)) { - return StandardCharsets.UTF_8; - } else if (expect(UTF16_BE_BOM)) { - return StandardCharsets.UTF_16BE; - } else if (expect(UTF16_LE_BOM)) { - return StandardCharsets.UTF_16LE; - } - } catch (IOException e) { /* stream could not be read, also return null */ } - return null; - } - private boolean processAtLeastOneByte() { try { return processComment() || processMeta() || processTag() || processSpecialTag() || diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java index b3678306b6..b100bb8fcf 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java @@ -33,25 +33,18 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; /** - * An encoding detector that respects the HTML5 encoding-sniff algorithm - * (https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream): - * BOM → HTTP Content-Type header → {@code <meta charset>} / {@code <meta http-equiv>} tag. + * Full WHATWG prescan charset detector for HTML: HTTP Content-Type header → + * {@code <meta charset>} / {@code <meta http-equiv>} tag, per + * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream. * - * <p>By default, BOM detection is skipped ({@code skipBOM=true}) because - * {@code BOMDetector} handles that as a separate step in the chain, producing its own - * DECLARATIVE result that {@code CharSoupEncodingDetector} can arbitrate against a - * contradicting {@code <meta charset>} declaration. + * <p>BOM detection is <em>not</em> performed here; {@code BOMDetector} handles + * that as a separate, earlier step in the detector chain. * - * <p>When used standalone (outside a {@link org.apache.tika.detect.CompositeEncodingDetector} - * chain without {@code BOMDetector}), set {@code skipBOM=false} to get the full HTML5 - * spec algorithm including BOM detection. - * - * <p>HTTP/MIME Content-Type and Content-Encoding metadata are always read here for - * standalone compatibility; in the chain they will already have been returned by - * {@code MetadataCharsetDetector} and {@code CharSoup} will handle the duplication - * gracefully (identical DECLARATIVE results agree, so no harm done). + * <p>Opt-in: register explicitly in a {@code <encodingDetectors>} config to use + * this detector in place of the lenient {@link org.apache.tika.parser.html.HtmlEncodingDetector} + * default. */ -@TikaComponent(name = "standard-html-encoding-detector") +@TikaComponent(name = "standard-html-encoding-detector", spi = false) public final class StandardHtmlEncodingDetector implements EncodingDetector { /** * Default number of bytes to scan for a {@code <meta charset>} declaration. @@ -64,21 +57,6 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { private int markLimit = META_TAG_BUFFER_SIZE; - /** - * When {@code true}, the BOM check is skipped and the detector goes directly to - * the Content-Type header and {@code <meta>} scan. This is the default because - * {@code BOMDetector} handles BOM detection as a separate step in the chain, - * allowing {@code CharSoupEncodingDetector} to arbitrate between a BOM declaration - * and a contradicting {@code <meta charset>} rather than having the BOM silently - * short-circuit the meta-tag scan. - * - * <p>Set to {@code false} only when using this detector standalone (without - * {@code BOMDetector} in the chain) to get full HTML5 spec-compliant behaviour.</p> - * - * <p>Default: {@code true}.</p> - */ - private boolean skipBOM = true; - @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, ParseContext context) throws IOException { @@ -88,17 +66,7 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { .setInputStream(tis).setMaxCount(limit).get(); PreScanner preScanner = new PreScanner(limitedStream); - Charset detectedCharset = null; - - if (!skipBOM) { - // HTML5 spec: BOM overrides everything. When used standalone this - // detector is responsible for BOM detection; when used in the chain with - // BOMDetector, setting skipBOM=true lets CharSoup arbitrate. - detectedCharset = preScanner.detectBOM(); - } - if (detectedCharset == null) { - detectedCharset = MetadataCharsetDetector.charsetFromContentType(metadata); - } + Charset detectedCharset = MetadataCharsetDetector.charsetFromContentType(metadata); if (detectedCharset == null) { detectedCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata); } @@ -125,18 +93,4 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { public void setMarkLimit(int markLimit) { this.markLimit = markLimit; } - - public boolean isSkipBOM() { - return skipBOM; - } - - /** - * When {@code true}, skip the BOM check and rely on {@code BOMDetector} in the - * chain. This allows {@code CharSoupEncodingDetector} to arbitrate between a - * BOM and a contradicting {@code <meta charset>} declaration. - * Default is {@code true}. - */ - public void setSkipBOM(boolean skipBOM) { - this.skipBOM = skipBOM; - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 4a10804d39..4234918e22 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -62,7 +62,7 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 4 base detectors (BOM, Metadata, ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector) + // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector) + CharSoupEncodingDetector (MetaEncodingDetector) assertEquals(5, detectors.size()); // meta detector is always last (partitioned by CompositeEncodingDetector) assertTrue(detectors.get(4) instanceof MetaEncodingDetector); @@ -72,7 +72,7 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(baseClasses.contains(BOMDetector.class)); assertTrue(baseClasses.contains(MetadataCharsetDetector.class)); assertTrue(baseClasses.contains(MojibusterEncodingDetector.class)); - assertTrue(baseClasses.contains(StandardHtmlEncodingDetector.class)); + assertTrue(baseClasses.contains(HtmlEncodingDetector.class)); } @Test @@ -284,14 +284,14 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 4 base detectors (BOM + Metadata + ML + StandardHtml), no MetaEncodingDetector + // 4 base detectors (BOM + Metadata + ML + HtmlEncodingDetector), no MetaEncodingDetector assertEquals(4, detectors.size()); Set<Class<?>> excludedCharSoupClasses = detectors.stream() .map(Object::getClass).collect(Collectors.toSet()); assertTrue(excludedCharSoupClasses.contains(BOMDetector.class)); assertTrue(excludedCharSoupClasses.contains(MetadataCharsetDetector.class)); assertTrue(excludedCharSoupClasses.contains(MojibusterEncodingDetector.class)); - assertTrue(excludedCharSoupClasses.contains(StandardHtmlEncodingDetector.class)); + assertTrue(excludedCharSoupClasses.contains(HtmlEncodingDetector.class)); for (EncodingDetector d : detectors) { assertNotContained("CharSoup", d.getClass().getSimpleName()); } @@ -341,41 +341,6 @@ public class TikaEncodingDetectorTest extends TikaTest { assertContains("\u0627\u0644\u0639\u0631\u0628\u064a\u0629", result.xml); } - /** - * Demonstrates loading a config file that sets {@code skipBOM=false} on - * {@code StandardHtmlEncodingDetector} for standalone use (without - * {@code BOMDetector} in the chain). With this config, the HTML detector - * handles BOM detection itself, so BOM takes precedence over - * {@code <meta charset>} per the HTML5 spec. - * - * @see configs/tika-config-html-standalone-bom.json - */ - @Test - public void testStandaloneHtmlBomConfig() throws Exception { - TikaLoader tikaLoader = - TikaLoaderHelper.getLoader("tika-config-html-standalone-bom.json"); - EncodingDetector detector = tikaLoader.loadEncodingDetectors(); - assertTrue(detector instanceof CompositeEncodingDetector); - List<EncodingDetector> detectors = - ((CompositeEncodingDetector) detector).getDetectors(); - assertEquals(1, detectors.size()); - assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector); - assertFalse(((StandardHtmlEncodingDetector) detectors.get(0)).isSkipBOM(), - "skipBOM should be false for standalone use"); - - // BOM-prefixed HTML with a contradicting <meta charset>: - // with skipBOM=false the BOM wins (HTML5 spec behaviour). - byte[] html = "\ufeff<meta charset='WINDOWS-1252'>" - .getBytes(StandardCharsets.UTF_8); - try (TikaInputStream tis = TikaInputStream.get(html)) { - List<EncodingResult> results = - detector.detect(tis, new Metadata(), new ParseContext()); - assertFalse(results.isEmpty()); - assertEquals(StandardCharsets.UTF_8, results.get(0).getCharset(), - "standalone skipBOM=false: BOM should override <meta charset>"); - } - } - private void findEncodingDetectionParsers(Parser p, List<Parser> encodingDetectionParsers) { if (p instanceof CompositeParser) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json index 78a813b580..240924a28c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json @@ -4,7 +4,7 @@ { "default-encoding-detector": { "exclude": [ - "standard-html-encoding-detector" + "html-encoding-detector" ] } }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-html-standalone-bom.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-html-standalone-bom.json deleted file mode 100644 index 775fca3b90..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/tika-config-html-standalone-bom.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "encoding-detectors": [ - { - "standard-html-encoding-detector": { - "skipBOM": false - } - } - ] -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java index 6de0376f73..a0c8bb5e90 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java @@ -54,6 +54,64 @@ public class HtmlEncodingDetectorTest { assertWindows1252("<meta charset='x-user-defined'>"); } + @Test + public void iso88591IsWindows1252() throws IOException { + // WHATWG: iso-8859-1 is an alias for windows-1252. + assertWindows1252("<meta charset='iso-8859-1'>"); + } + + @Test + public void usAsciiIsWindows1252() throws IOException { + assertWindows1252("<meta charset='us-ascii'>"); + } + + @Test + public void iso88599IsWindows1254() throws IOException { + assertCharset("<meta charset='iso-8859-9'>", Charset.forName("windows-1254")); + } + + @Test + public void tis620IsWindows874() throws IOException { + assertCharset("<meta charset='tis-620'>", Charset.forName("windows-874")); + } + + @Test + public void gb2312IsGbk() throws IOException { + assertCharset("<meta charset='gb2312'>", Charset.forName("GBK")); + } + + @Test + public void ms932IsShiftJis() throws IOException { + assertCharset("<meta charset='ms932'>", Charset.forName("Shift_JIS")); + } + + @Test + public void ms949IsXWindows949() throws IOException { + // Tika convention (differs from WHATWG which downgrades to EUC-KR): + // route MS949 labels to x-windows-949 to preserve extension bytes. + assertCharset("<meta charset='ms949'>", Charset.forName("x-windows-949")); + assertCharset("<meta charset='windows-949'>", Charset.forName("x-windows-949")); + } + + @Test + public void nakedUtf16IsUtf16Le() throws IOException { + // WHATWG: naked 'utf-16' (no BOM) defaults to UTF-16LE. + assertCharset("<meta charset='utf-16'>", StandardCharsets.UTF_16LE); + } + + @Test + public void hebrewLabelIsIso88598() throws IOException { + assertCharset("<meta charset='hebrew'>", Charset.forName("ISO-8859-8")); + } + + @Test + public void iso2022KrIsNotReplaced() throws IOException { + // WHATWG replaces iso-2022-kr with a dummy "replacement" decoder; + // Tika keeps the real ISO-2022-KR charset because we want to extract + // text, not block attacks. + assertCharset("<meta charset='iso-2022-kr'>", Charset.forName("ISO-2022-KR")); + } + @Test public void withSlash() throws IOException { assertWindows1252("<meta/charset='WINDOWS-1252'>"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java index 9844fb137b..16d7f8359d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java @@ -190,21 +190,10 @@ public class StandardHtmlEncodingDetectorTest { } @Test - public void bomStandalone() throws IOException { - // When used standalone (skipBOM=false), BOM has precedence over meta per HTML5 spec. - // In the default chain, BOMDetector handles BOM separately so this detector - // skips it (skipBOM=true) and focuses on <meta charset>. - // See TikaEncodingDetectorTest.testStandaloneHtmlBomConfig() for a config-file example. - StandardHtmlEncodingDetector standalone = standaloneDetector(); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8, standalone); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE, standalone); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE, standalone); - } - - @Test - public void bomSkippedByDefault() throws IOException { - // With default skipBOM=true, BOM is ignored and meta tag wins. - // BOMDetector in the chain handles BOM as a separate context entry. + public void bomIgnoredMetaWins() throws IOException { + // This detector no longer handles BOMs; BOMDetector is a separate detector + // in the chain. If the stream happens to start with BOM bytes, the prescan + // still finds <meta charset>. assertCharset("\ufeff<meta charset='WINDOWS-1252'>", Charset.forName("WINDOWS-1252")); } @@ -306,11 +295,6 @@ public class StandardHtmlEncodingDetectorTest { assertWindows1252(""); assertWindows1252("<meta charset='UTF-8'>"); assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>"); - // With skipBOM=false (standalone), BOM has precedence over transport layer info - StandardHtmlEncodingDetector standalone = standaloneDetector(); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8, standalone); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE, standalone); - assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE, standalone); } @Test @@ -349,12 +333,6 @@ public class StandardHtmlEncodingDetectorTest { assertArrayEquals(inBytes, outBytes); } - private static StandardHtmlEncodingDetector standaloneDetector() { - StandardHtmlEncodingDetector d = new StandardHtmlEncodingDetector(); - d.setSkipBOM(false); - return d; - } - private void assertWindows1252(String html) throws IOException { assertCharset(html, Charset.forName("WINDOWS-1252")); }
