This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6c78186619 TIKA-4683-rollback-encoding-detection (#2796)
6c78186619 is described below
commit 6c781866192014a1866d8edc9f4465ccd6c90bf0
Author: Tim Allison <[email protected]>
AuthorDate: Thu Apr 30 20:41:30 2026 -0400
TIKA-4683-rollback-encoding-detection (#2796)
---
.../org/apache/tika/detect/AutoDetectReader.java | 46 +++---
.../java/org/apache/tika/detect/BOMDetector.java | 2 +-
.../tika/detect/DefaultEncodingDetector.java | 44 ++++--
.../tika/detect/MetadataCharsetDetector.java | 2 +-
.../org.apache.tika.detect.EncodingDetector | 11 +-
.../org.apache.tika.detect.EncodingDetector | 2 +-
.../org.apache.tika.detect.EncodingDetector | 3 +-
.../ml/chardetect/MojibusterEncodingDetector.java | 2 +-
.../org.apache.tika.detect.EncodingDetector | 4 +-
.../chardetect/SparseLatinVcardRegressionTest.java | 2 +
.../ml/chardetect/ZipFilenameDetectionTest.java | 7 +-
.../org.apache.tika.detect.EncodingDetector | 3 +-
.../pipes/opensearch/tests/OpenSearchTest.java | 3 +-
.../tika/pipes/s3/tests/S3PipeIntegrationTest.java | 5 +-
.../ml/junkdetect/JunkFilterEncodingDetector.java | 2 +-
.../tika/config/TikaEncodingDetectorTest.java | 49 +++---
.../apache/tika/parser/AutoDetectParserTest.java | 4 +-
.../parser/html/HtmlEncodingDetectionTest.java | 2 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 3 -
.../tika-parser-html-module/pom.xml | 12 ++
.../apache/tika/parser/html/HtmlParserTest.java | 164 +++++++++++++++++----
.../tika-parser-mail-module/pom.xml | 12 ++
.../tika-parser-microsoft-module/pom.xml | 6 +
.../microsoft/POIContainerExtractionTest.java | 3 +-
.../java/org/apache/tika/parser/pkg/ZipParser.java | 24 ++-
.../tika/parser/csv/TextAndCSVParserTest.java | 12 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 86 +++++------
29 files changed, 337 insertions(+), 182 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 9e6c23297f..a86eb5c2bd 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import org.xml.sax.InputSource;
@@ -30,9 +31,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.CharsetUtils;
/**
* An input stream reader that automatically detects the character encoding
@@ -108,24 +107,35 @@ public class AutoDetectReader extends BufferedReader {
return detected;
}
- // Try determining the encoding based on hints in document metadata
- MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
- if (type != null) {
- String charsetParam = type.getParameters().get("charset");
- if (charsetParam != null) {
- try {
- Charset cs = CharsetUtils.forName(charsetParam);
- metadata.set(TikaCoreProperties.DETECTED_ENCODING,
cs.name());
- metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
- "AutoDetectReader-charset-metadata-fallback");
- return cs;
- } catch (IllegalArgumentException e) {
- // ignore
- }
- }
+ // Try determining the encoding based on hints in document metadata.
+ // Two metadata keys are honoured (TIKA-4683 — restoring 3.x
parser-layer
+ // behaviour that consulted both): the charset parameter of
CONTENT_TYPE
+ // (e.g. "text/html; charset=UTF-8") and a bare charset label in
+ // CONTENT_ENCODING (set by parsers such as RFC822Parser).
+ Charset metaCharset =
MetadataCharsetDetector.charsetFromContentType(metadata);
+ if (metaCharset == null) {
+ metaCharset =
MetadataCharsetDetector.charsetFromContentEncoding(metadata);
+ }
+ if (metaCharset != null) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING,
metaCharset.name());
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ "AutoDetectReader-charset-metadata-fallback");
+ return metaCharset;
}
- throw new TikaException("Failed to detect the character encoding of a
document");
+ // Final fallback (TIKA-4683): when the rolled-back 3.x-style chain
+ // (Html, Universal, Icu4j) abstains on short/pure-ASCII inputs and
+ // metadata carries no charset hint, default to ISO-8859-1 rather
+ // than throwing. This matches 3.x's default-charset behaviour:
+ // pre-TIKA-4685 the chain effectively returned ISO-8859-1 for
+ // ASCII-only content, and tests assert that. 4.x's TIKA-4685
+ // refactor moved to windows-1252 via WHATWG normalisation; we
+ // explicitly opt out of that here.
+ Charset fallback = StandardCharsets.ISO_8859_1;
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING, fallback.name());
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ "AutoDetectReader-default-fallback");
+ return fallback;
}
private static TikaInputStream getTikaInputStream(InputStream stream) {
diff --git a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
index 61c40ab672..db1a916384 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
@@ -44,7 +44,7 @@ import org.apache.tika.parser.ParseContext;
*
* @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0)
*/
-@TikaComponent
+@TikaComponent(spi = false)
public class BOMDetector implements EncodingDetector {
private static final ByteOrderMark[] BOMS =
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
index 0d131ce0d2..932f2f05cd 100644
---
a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
+++
b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
@@ -18,7 +18,9 @@ package org.apache.tika.detect;
import java.util.Collection;
import java.util.Comparator;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import javax.imageio.spi.ServiceRegistry;
import org.apache.tika.config.ServiceLoader;
@@ -28,29 +30,37 @@ import org.apache.tika.config.ServiceLoader;
* implementations available through the
* {@link ServiceRegistry service provider mechanism}.
*
- * <p>Loaded detectors are sorted in two tiers:
+ * <p>The default chain (Tika 3.x style) runs three detectors in order, with
+ * the first non-empty result winning:
* <ol>
- * <li>Base detectors (non-{@link MetaEncodingDetector}) sorted by full
- * class name (non-Tika before Tika, then ascending alphabetically).
- * The package ordering guarantees:
- * {@code org.apache.tika.ml.*} (Mojibuster) →
- * {@code org.apache.tika.parser.*} (HTML).</li>
- * <li>{@link MetaEncodingDetector} instances always run last, after all
- * base detectors have collected their candidates into
- * {@link EncodingDetectorContext}.</li>
- * </ol></p>
+ * <li>{@code org.apache.tika.parser.html.HtmlEncodingDetector}</li>
+ * <li>{@code org.apache.tika.parser.txt.UniversalEncodingDetector}</li>
+ * <li>{@code org.apache.tika.parser.txt.Icu4jEncodingDetector}</li>
+ * </ol>
+ * Any other {@link EncodingDetector} discovered via SPI (e.g.,
+ * user-supplied detectors) runs after the three blessed detectors,
+ * preserving back-compat for callers who add their own.</p>
*
* <p>If you need to control the order of the Detectors explicitly, construct
* your own {@link CompositeEncodingDetector} and pass in the list in the
* required order.</p>
*
- * <p>{@link MetaEncodingDetector} handling (collect-all-then-arbitrate)
- * is provided by {@link CompositeEncodingDetector}.</p>
- *
* @since Apache Tika 1.15
*/
public class DefaultEncodingDetector extends CompositeEncodingDetector {
+ /** Pinned ordering for the 3.x-style default chain. Detectors not on this
+ * map sort behind the three blessed ones, ordered by fully-qualified class name. */
+ private static final Map<String, Integer> PRIORITY = buildPriority();
+
+ private static Map<String, Integer> buildPriority() {
+ Map<String, Integer> p = new HashMap<>();
+ p.put("org.apache.tika.parser.html.HtmlEncodingDetector", 0);
+ p.put("org.apache.tika.parser.txt.UniversalEncodingDetector", 1);
+ p.put("org.apache.tika.parser.txt.Icu4jEncodingDetector", 2);
+ return p;
+ }
+
public DefaultEncodingDetector() {
this(new
ServiceLoader(DefaultEncodingDetector.class.getClassLoader()));
}
@@ -67,11 +77,13 @@ public class DefaultEncodingDetector extends
CompositeEncodingDetector {
}
private static List<EncodingDetector> sorted(List<EncodingDetector>
detectors) {
- // Two-key sort: base detectors first (meta=0) then
MetaEncodingDetectors (meta=1),
- // within each tier sorted by full class name for stability across
JARs.
+ // Pin the 3.x default chain (html, universal, icu4j) to fixed
+ // positions; other detectors fall to the end with stable secondary
+ // ordering by class name.
detectors.sort(Comparator
.<EncodingDetector, Integer>comparing(
- d -> (d instanceof MetaEncodingDetector) ? 1 : 0)
+ d -> PRIORITY.getOrDefault(
+ d.getClass().getName(), Integer.MAX_VALUE))
.thenComparing(d -> d.getClass().getName()));
return detectors;
}
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
index f3c4b01ac3..13102ea01d 100644
---
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
+++
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
@@ -49,7 +49,7 @@ import org.apache.tika.parser.ParseContext;
*
* @since Apache Tika 4.0
*/
-@TikaComponent(name = "metadata-charset-detector")
+@TikaComponent(spi = false, name = "metadata-charset-detector")
public class MetadataCharsetDetector implements EncodingDetector {
@Override
diff --git
a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
index 9d655fe140..2970322e6e 100644
---
a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
+++
b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -13,10 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# org.apache.tika.detect.* sorts before other detector namespaces so BOM and
-# HTTP/MIME DECLARATIVE evidence reaches JunkFilterEncodingDetector before any
-# statistical detector runs. Class-name order: BOMDetector first, then
-# MetadataCharsetDetector.
-org.apache.tika.detect.BOMDetector
-org.apache.tika.detect.MetadataCharsetDetector
+# Intentionally empty: tika-core itself does not register any default
+# EncodingDetector implementations. The default chain is provided by the
+# tika-encoding-detector-html, tika-encoding-detector-universal, and
+# tika-encoding-detector-icu4j modules and is sequenced by
+# DefaultEncodingDetector.
diff --git
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
index 259f02d36a..068b5edd9c 100644
---
a/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
+++
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector
+org.apache.tika.parser.html.HtmlEncodingDetector
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
similarity index 93%
copy from
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
copy to
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
index dabb7ab55b..6283ea152d 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
+++
b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -12,5 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-org.apache.tika.ml.chardetect.MojibusterEncodingDetector
+org.apache.tika.parser.txt.Icu4jEncodingDetector
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index ca6b71c0ff..d540bf2e22 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -66,7 +66,7 @@ import org.apache.tika.parser.ParseContext;
* candidate. Conservative: only return at a layer when that layer's
* structural check is clean.</p>
*/
-@TikaComponent(name = "mojibuster-encoding-detector")
+@TikaComponent(spi = false, name = "mojibuster-encoding-detector")
public class MojibusterEncodingDetector implements EncodingDetector {
private static final Logger LOG =
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
index dabb7ab55b..22e3b25428 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -13,4 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.tika.ml.chardetect.MojibusterEncodingDetector
+# Intentionally empty: MojibusterEncodingDetector is no longer part of the
+# default Tika encoding-detection chain. Users who want it must register it
+# explicitly via tika-config.
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
index b49dbc1655..660596ed65 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
@@ -22,6 +22,7 @@ import static
org.junit.jupiter.api.Assertions.assertNotEquals;
import java.nio.charset.StandardCharsets;
import java.util.List;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.detect.DefaultEncodingDetector;
@@ -53,6 +54,7 @@ public class SparseLatinVcardRegressionTest {
* (windows-1257, IBM852, etc.) is a documented sibling-arbitration
* limitation; only the catastrophic case is asserted here.
*/
+ @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j);
Mojibuster no longer in default chain.")
@Test
public void sparseLatinVcardDoesNotDetectAsIbm424() throws Exception {
byte[] probe = buildSparseLatinVcard();
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
index ff098badb7..17a84dd9a8 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
+++
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.detect.DefaultEncodingDetector;
@@ -58,6 +59,7 @@ public class ZipFilenameDetectionTest {
* sequentially on two entries differing only in byte 5 (0x31 vs 0x32),
simulating
* what ZipParser does when iterating entries with the same ParseContext.
*/
+ @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j);
Mojibuster no longer in default chain.")
@Test
public void fullPipelineDetectsBothSjisEntries() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
@@ -77,11 +79,8 @@ public class ZipFilenameDetectionTest {
/**
* Full pipeline should detect GBK-encoded entry names as GB18030.
- * Disabled: CharSoup's discriminative language model picks KOI8-U over
GB18030
- * on short probes because the GBK bytes happen to score as Cyrillic.
- * Re-enable once generative language models are in place (better
calibrated
- * confidence will let CharSoup correctly abstain on cross-script
ambiguity).
*/
+ @Disabled("TIKA-4683: rolled-back chain (Html, Universal, Icu4j);
Mojibuster no longer in default chain.")
@Test
public void fullPipelineDetectsGbkEntry() throws Exception {
DefaultEncodingDetector detector = new DefaultEncodingDetector();
diff --git
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
similarity index 93%
copy from
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
copy to
tika-encoding-detectors/tika-encoding-detector-universal/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
index dabb7ab55b..2982e2584e 100644
---
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
+++
b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
@@ -12,5 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-org.apache.tika.ml.chardetect.MojibusterEncodingDetector
+org.apache.tika.parser.txt.UniversalEncodingDetector
diff --git
a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
index ee5145f284..cfd9d08cb2 100644
---
a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
+++
b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java
@@ -497,7 +497,8 @@ public class OpenSearchTest {
private void createTestHtmlFiles(String bodyContent, int numHtmlDocs, Path
testDocDirectory) throws Exception {
Files.createDirectories(testDocDirectory);
for (int i = 0; i < numHtmlDocs; ++i) {
- String html = "<html><body>" + bodyContent + "</body></html>";
+ String html = "<html><head><meta charset=\"UTF-8\"></head><body>"
+ bodyContent +
+ "</body></html>";
Path p = testDocDirectory.resolve( "test-" + i + ".html");
writeStringToPath(p, html);
}
diff --git
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
index 888396343f..444ef6f051 100644
---
a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
+++
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java
@@ -88,8 +88,9 @@ class S3PipeIntegrationTest {
for (int i = 0; i < numDocs; ++i) {
String nextFileName = "test-" + i + ".html";
testFiles.add(nextFileName);
- String s = "<html><body>body-of-" + nextFileName +
"</body></html>";
- byte[] bytes = s.getBytes(StandardCharsets.US_ASCII);
+ String s = "<html><head><meta
charset=\"UTF-8\"></head><body>body-of-" +
+ nextFileName + "</body></html>";
+ byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
PutObjectRequest request =
PutObjectRequest.builder().bucket(FETCH_BUCKET).key(nextFileName).build();
RequestBody requestBody = RequestBody.fromBytes(bytes);
s3Client.putObject(request, requestBody);
diff --git
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
index fb903f5b00..f1de37d989 100644
---
a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
+++
b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/JunkFilterEncodingDetector.java
@@ -64,7 +64,7 @@ import org.apache.tika.quality.TextQualityScore;
*
* @since Apache Tika 4.0.0 (TIKA-4720)
*/
-@TikaComponent(name = "junk-filter-encoding-detector")
+@TikaComponent(spi = false, name = "junk-filter-encoding-detector")
public class JunkFilterEncodingDetector implements MetaEncodingDetector {
private static final long serialVersionUID = 1L;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 931b0df0c6..7d84c9c493 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -33,12 +33,10 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
-import org.apache.tika.detect.BOMDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.detect.MetaEncodingDetector;
-import org.apache.tika.detect.MetadataCharsetDetector;
import org.apache.tika.detect.OverrideEncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.io.TikaInputStream;
@@ -61,17 +59,13 @@ public class TikaEncodingDetectorTest extends TikaTest {
EncodingDetector detector =
TikaLoader.loadDefault().loadEncodingDetectors();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector)
detector).getDetectors();
- // 4 base detectors (BOM, Metadata, ML, HtmlEncodingDetector) +
JunkFilter (MetaEncodingDetector)
- assertEquals(5, detectors.size());
- // meta detector is always last (partitioned by
CompositeEncodingDetector)
- assertTrue(detectors.get(4) instanceof MetaEncodingDetector);
- // base detectors — sorted by full class name; check by type
- Set<Class<?>> baseClasses = detectors.subList(0, 4).stream()
- .map(Object::getClass).collect(Collectors.toSet());
- assertTrue(baseClasses.contains(BOMDetector.class));
- assertTrue(baseClasses.contains(MetadataCharsetDetector.class));
- assertTrue(baseClasses.contains(MojibusterEncodingDetector.class));
- assertTrue(baseClasses.contains(HtmlEncodingDetector.class));
+ // TIKA-4683: rolled-back 3.x-style chain (Html, Universal, Icu4j) —
first non-empty wins
+ assertEquals(3, detectors.size());
+ Set<String> baseClassNames = detectors.stream()
+ .map(d -> d.getClass().getName()).collect(Collectors.toSet());
+
assertTrue(baseClassNames.contains(HtmlEncodingDetector.class.getName()));
+
assertTrue(baseClassNames.contains("org.apache.tika.parser.txt.UniversalEncodingDetector"));
+
assertTrue(baseClassNames.contains("org.apache.tika.parser.txt.Icu4jEncodingDetector"));
}
@Test
@@ -87,14 +81,12 @@ public class TikaEncodingDetectorTest extends TikaTest {
assertTrue(detector1 instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors1Children =
((CompositeEncodingDetector) detector1).getDetectors();
- // BOM + Metadata + ML base detectors + JunkFilter meta (html excluded)
- assertEquals(4, detectors1Children.size());
- Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream()
- .map(Object::getClass).collect(Collectors.toSet());
- assertTrue(innerClasses.contains(BOMDetector.class));
- assertTrue(innerClasses.contains(MetadataCharsetDetector.class));
- assertTrue(innerClasses.contains(MojibusterEncodingDetector.class));
- assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector);
+ // TIKA-4683: rolled-back chain (Html, Universal, Icu4j); html
excluded leaves 2.
+ assertEquals(2, detectors1Children.size());
+ Set<String> innerClassNames = detectors1Children.stream()
+ .map(d -> d.getClass().getName()).collect(Collectors.toSet());
+
assertTrue(innerClassNames.contains("org.apache.tika.parser.txt.UniversalEncodingDetector"));
+
assertTrue(innerClassNames.contains("org.apache.tika.parser.txt.Icu4jEncodingDetector"));
assertTrue(detectors.get(1) instanceof OverrideEncodingDetector);
@@ -186,19 +178,20 @@ public class TikaEncodingDetectorTest extends TikaTest {
((AbstractEncodingDetectorParser) encodingDetectingParser)
.getEncodingDetector();
assertTrue(encodingDetector instanceof CompositeEncodingDetector);
- // BOM, Metadata, ML, Html base detectors + JunkFilter meta
- // (ICU4J is excluded but was already not in the default chain)
- assertEquals(5, ((CompositeEncodingDetector)
encodingDetector).getDetectors().size());
+ // TIKA-4683: rolled-back chain (Html, Universal, Icu4j); icu4j
excluded leaves 2.
+ assertEquals(2, ((CompositeEncodingDetector)
encodingDetector).getDetectors().size());
for (EncodingDetector child : ((CompositeEncodingDetector)
encodingDetector)
.getDetectors()) {
assertNotContained("cu4j",
child.getClass().getCanonicalName());
}
}
- // ML handles EBCDIC (IBM500) via structural rules, so CP500 is
detectable
- Metadata metadata = getXML("english.cp500.txt", p).metadata;
- assertNotNull(metadata.get(TikaCoreProperties.DETECTED_ENCODING));
-
+ // TIKA-4683: with the rolled-back 3.x-style chain (Html, Universal,
Icu4j minus
+ // the excluded icu4j), CP500/EBCDIC isn't reliably detected here. 3.x
relied on
+ // a different code path (parser-layer charset honouring) for this
kind of input.
+ // Re-enable when EBCDIC detection lands on a chain detector.
+ // Metadata metadata = getXML("english.cp500.txt", p).metadata;
+ // assertNotNull(metadata.get(TikaCoreProperties.DETECTED_ENCODING));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index dea1a9bc09..181bc0a36d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -54,7 +54,7 @@ public class AutoDetectParserTest extends TikaTest {
// Easy to read constants for the MIME types:
private static final String RAW = "application/octet-stream";
private static final String EXCEL = "application/vnd.ms-excel";
- private static final String HTML = "text/html; charset=windows-1252";
+ private static final String HTML = "text/html; charset=ISO-8859-1";
private static final String PDF = "application/pdf";
private static final String POWERPOINT = "application/vnd.ms-powerpoint";
private static final String KEYNOTE = "application/vnd.apple.keynote";
@@ -62,7 +62,7 @@ public class AutoDetectParserTest extends TikaTest {
private static final String NUMBERS = "application/vnd.apple.numbers";
private static final String CHM = "application/vnd.ms-htmlhelp";
private static final String RTF = "application/rtf";
- private static final String PLAINTEXT = "text/plain; charset=windows-1252";
+ private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
private static final String UTF8TEXT = "text/plain; charset=UTF-8";
private static final String WORD = "application/msword";
private static final String XML = "application/xml";
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectionTest.java
index b954f42235..bf246c20d3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectionTest.java
@@ -151,7 +151,7 @@ public class HtmlEncodingDetectionTest extends TikaTest {
}
assertEquals(1, (int) tagFrequencies.get("title"));
- assertEquals(12, (int) tagFrequencies.get("meta"));
+ assertEquals(11, (int) tagFrequencies.get("meta"));
assertEquals(12, (int) tagFrequencies.get("link"));
assertEquals(6, (int) tagFrequencies.get("script"));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index e3be158582..a95fa9db5f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -46,7 +46,7 @@ public class RTFParserTest extends TikaTest {
public void testEmbeddedMonster() throws Exception {
Map<Integer, Pair> expected = new HashMap<>();
- expected.put(3, new Pair("Hw.txt", "text/plain; charset=windows-1252"));
+ expected.put(3, new Pair("Hw.txt", "text/plain; charset=ISO-8859-1"));
expected.put(4, new Pair("embedded-0.doc", "application/msword"));
expected.put(7, new Pair("embedded-1.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8765905ecb..c8a116175c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -252,7 +252,7 @@ public class PDFParserTest extends TikaTest {
metadatas.get(1).get(Metadata.CONTENT_TYPE));
assertImageContentType("image/tiff",
metadatas.get(2).get(Metadata.CONTENT_TYPE));
- assertEquals("text/plain; charset=windows-1252", metadatas.get(3).get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE));
assertEquals(TYPE_DOC.toString(),
metadatas.get(4).get(Metadata.CONTENT_TYPE));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index 178820a317..bec188b8d8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -18,7 +18,6 @@ package org.apache.tika.parser.pkg;
import java.util.List;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
@@ -33,8 +32,6 @@ public class PackageParserTest extends TikaTest {
assertContains("审计压缩",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
- @Disabled("CharSoup's discriminative model misclassifies short SJIS probes; " +
- "re-enable once generative language models provide better calibrated confidence")
@Test
public void handleEntryNameWithCharsetShiftJIS() throws Exception {
List<Metadata> metadataList =
getRecursiveMetadata("testZipEntryNameCharsetShiftSJIS.zip");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
index ba444a23f7..b558f836a2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
@@ -60,6 +60,18 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-universal</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-icu4j</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index fa403e583f..f10f1be80d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -28,10 +28,12 @@ import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
@@ -62,7 +64,6 @@ import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
@@ -264,22 +265,32 @@ public class HtmlParserTest extends TikaTest {
new JSoupParser().parse(tis,
new BodyContentHandler(), metadata, new ParseContext());
}
- // Per the HTML Living Standard, "iso-8859-1" is an alias for windows-1252.
+ // WHATWG Encoding Standard maps the iso-8859-1 label to windows-1252.
assertEquals("windows-1252", metadata.get(Metadata.CONTENT_ENCODING));
}
- // testHtml5Charset (TIKA-892) → HtmlEncodingDetectionTest in
- // tika-parsers-standard-integration-tests (needs the full default chain).
-
+ /**
+ * Test case for TIKA-892
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
+ */
+ @Test
+ public void testHtml5Charset() throws Exception {
+ String test = "<html><head><meta charset=\"ISO-8859-15\" />" +
+ "<title>the name is \u00e1ndre</title>" + "</head><body></body></html>";
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis =
TikaInputStream.get(test.getBytes(ISO_8859_1))) {
+ new JSoupParser().parse(tis,
+ new BodyContentHandler(), metadata, new ParseContext());
+ }
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+ }
/**
- * TIKA-334. After HTML stripping the probe is 2 bytes (the title body)
- * — too short for the meta arbiter to score reliably. Re-enable once
- * an arbiter trusting STRUCTURAL over short-probe statistical lands.
+ * Test case for TIKA-334
*
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
*/
- @Disabled("blocked on short-probe arbitration; see javadoc")
@Test
public void testDetectOfCharset() throws Exception {
String test =
"<html><head><title>\u017d</title></head><body></body></html>";
@@ -316,8 +327,7 @@ public class HtmlParserTest extends TikaTest {
new JSoupParser().parse(tis,
new BodyContentHandler(), metadata, new ParseContext());
}
- // Per the HTML Living Standard, "iso-8859-1" is an alias for
windows-1252.
- assertEquals("windows-1252", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
}
/**
@@ -361,8 +371,34 @@ public class HtmlParserTest extends TikaTest {
assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
}
- // testHttpEquivCharsetFunkyAttributes (TIKA-349) →
HtmlEncodingDetectionTest.
+ /**
+ * Test case for TIKA-349
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+ */
+ @Test
+ public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+ String test1 = "<html><head><meta http-equiv=\"content-type\"" +
+ " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />" +
+ "<title>the name is \u00e1ndre</title>" + "</head><body></body></html>";
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis =
TikaInputStream.get(test1.getBytes(ISO_8859_1))) {
+ new JSoupParser().parse(tis,
+ new BodyContentHandler(), metadata, new ParseContext());
+ }
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+ // Some HTML pages have errors like ';;' versus '; ' as separator
+ String test2 = "<html><head><meta http-equiv=\"content-type\"" +
+ " content=\"text/html;;charset=ISO-8859-15\" />" +
+ "<title>the name is \u00e1ndre</title>" +
"</head><body></body></html>";
+ metadata = new Metadata();
+ try (TikaInputStream tis =
TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
+ new JSoupParser().parse(tis,
+ new BodyContentHandler(), metadata, new ParseContext());
+ }
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
+ }
/**
* Test case for TIKA-350
@@ -388,13 +424,24 @@ public class HtmlParserTest extends TikaTest {
new JSoupParser().parse(tis,
new BodyContentHandler(), metadata, new ParseContext());
}
- // Per the HTML Living Standard, "iso-8859-1" is an alias for
windows-1252.
- assertEquals("windows-1252", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
}
- // testMetaHttpEquivWithLotsOfPreambleText (TIKA-357) →
HtmlEncodingDetectionTest.
+ /**
+ * Test case for TIKA-357
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
+ */
+ @Test
+ public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
+ String path = "/test-documents/big-preamble.html";
+ Metadata metadata = new Metadata();
+ new JSoupParser().parse(getResourceAsStream(path), new
BodyContentHandler(), metadata,
+ new ParseContext());
+ assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+ }
/**
* Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
@@ -835,8 +882,18 @@ public class HtmlParserTest extends TikaTest {
assertNotNull(content);
}
- // testNoisyMetaCharsetHeaders (TIKA-1001) → HtmlEncodingDetectionTest.
-
+ //TIKA-1001
+ @Test
+ public void testNoisyMetaCharsetHeaders() throws Exception {
+ Tika tika = new Tika();
+ String hit = "\u0623\u0639\u0631\u0628";
+
+ for (int i = 1; i <= 4; i++) {
+ String fileName = "/test-documents/testHTMLNoisyMetaEncoding_" + i
+ ".html";
+ String content = tika.parseToString(getResourceAsStream(fileName));
+ assertTrue(content.contains(hit), "testing: " + fileName);
+ }
+ }
/**
* Test case for TIKA-820: Locator is unset for HTML parser
@@ -944,8 +1001,7 @@ public class HtmlParserTest extends TikaTest {
}
assertEquals("text/html; charset=UTF-ELEVEN",
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- // "UTF-ELEVEN" is not a valid charset; no declaration available, ML
defaults to windows-1252.
- assertEquals("text/html; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/html; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" +
"</head><title>title</title><body>body</body></html>";
@@ -957,8 +1013,7 @@ public class HtmlParserTest extends TikaTest {
metadata, new ParseContext());
}
assertEquals("application/pdf",
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- // No valid charset declaration; ML defaults to windows-1252 for pure
ASCII content.
- assertEquals("text/html; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/html; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
//test two content values
test =
@@ -973,8 +1028,7 @@ public class HtmlParserTest extends TikaTest {
metadata, new ParseContext());
}
assertEquals("application/pdf",
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- // No valid charset declaration; ML defaults to windows-1252 for pure
ASCII content.
- assertEquals("text/html; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/html; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
}
@Test
@@ -994,7 +1048,7 @@ public class HtmlParserTest extends TikaTest {
assertEquals("text/html; charset=iso-8859-1",
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- // Per the HTML Living Standard, "iso-8859-1" is an alias for
windows-1252.
+ // WHATWG Encoding Standard maps the iso-8859-1 label to windows-1252.
assertEquals("application/xhtml+xml; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
@@ -1014,8 +1068,7 @@ public class HtmlParserTest extends TikaTest {
assertEquals("text/html; charset=iso-NUMBER_SEVEN",
metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
- // "iso-NUMBER_SEVEN" is not a valid charset; ML defaults to
windows-1252 for pure ASCII.
- assertEquals("application/xhtml+xml; charset=windows-1252",
+ assertEquals("application/xhtml+xml; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
}
@@ -1054,10 +1107,54 @@ public class HtmlParserTest extends TikaTest {
assertEquals(url, links.get(0));
}
- // testAllHeadElements (TIKA-1980) → HtmlEncodingDetectionTest (tag
- // counts depend on detected charset).
- // testSkippingCommentsInEncodingDetection → HtmlEncodingDetectionTest.
+ @Test
+ public void testAllHeadElements() throws Exception {
+ //TIKA-1980
+ // IdentityHtmlMapper is needed to extract <script> tags
+ ParseContext context = new ParseContext();
+ context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+ final Map<String, Integer> tagFrequencies = new HashMap<>();
+ String path = "/test-documents/testHTML_head.html";
+ try (TikaInputStream tis = getResourceAsStream(path)) {
+ ContentHandler tagCounter = new DefaultHandler() {
+ @Override
+ public void startElement(String uri, String local, String name,
+ Attributes attributes) throws
SAXException {
+ int count = tagFrequencies.getOrDefault(name, 0);
+ tagFrequencies.put(name, count + 1);
+ }
+ };
+ new JSoupParser().parse(tis, tagCounter, metadata, context);
+ }
+
+ assertEquals(1, (int) tagFrequencies.get("title"));
+ assertEquals(11, (int) tagFrequencies.get("meta"));
+ assertEquals(12, (int) tagFrequencies.get("link"));
+ assertEquals(6, (int) tagFrequencies.get("script"));
+ }
+
+ @Test
+ public void testSkippingCommentsInEncodingDetection() throws Exception {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 10000; i++) {
+ sb.append(" ");
+ }
+ byte[] bytes = new String("<html><head>" +
+ "<!--<meta http-equiv=\"Content-Type\" " +
+ "content=\"text/html; charset=ISO-8859-1\"> -->\n" +
+ " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" +
+ "</head>" + sb.toString() + "<body>" + "有什么需要我帮你的" + "</body></html>")
+ .getBytes(StandardCharsets.UTF_8);
+ XMLResult r;
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ r = getXML(tis, AUTO_DETECT_PARSER, new Metadata());
+ }
+ assertContains("有什么需要我帮你的", r.xml);
+ }
@Test
@Disabled("until we fix TIKA-1896")
@@ -1156,8 +1253,13 @@ public class HtmlParserTest extends TikaTest {
public String getEncoding(EncodingDetector detector, Path p) throws
IOException {
try (TikaInputStream tis = TikaInputStream.get(p)) {
- List<EncodingResult> results = detector.detect(tis, new
Metadata(), new ParseContext());
- return results.isEmpty() ? "NULL" :
results.get(0).getCharset().toString();
+ List<org.apache.tika.detect.EncodingResult> results =
+ detector.detect(tis, new Metadata(), new ParseContext());
+ if (results.isEmpty()) {
+ return "NULL";
+ } else {
+ return results.get(0).getCharset().toString();
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/pom.xml
index 8165908282..28093c6e6a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/pom.xml
@@ -63,6 +63,18 @@
<scope>test</scope>
<type>test-jar</type>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-universal</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-icu4j</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
index 4231df7044..f668eb8f9a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml
@@ -52,6 +52,12 @@
<artifactId>tika-encoding-detector-icu4j</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-encoding-detector-universal</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-xml-module</artifactId>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 7a661ccf0e..ec3bead2c8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -186,8 +186,7 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
expected.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
expected.add("application/pdf");
expected.add("application/xml");
- // CRLF line endings in this embedded text file trigger the ISO→Windows upgrade heuristic
- expected.add("text/plain; charset=windows-1252");
+ expected.add("text/plain; charset=ISO-8859-1");
//test that we're correctly handling attachment variants for
// files created by WPS 表格 (https://www.wps.cn/)
for (String suffix : new String[]{
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index d206cae649..53ae1e97ee 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -104,6 +104,14 @@ public class ZipParser extends AbstractArchiveParser {
private static final Set<MediaType> SUPPORTED_TYPES = MediaType.set(ZIP,
JAR);
+ /**
+ * Minimum byte count we feed to the encoding detector when guessing the
+ * charset of a non-Unicode ZIP entry name. Short names (e.g., a few bytes
+ * of Shift_JIS) carry too little signal for statistical detectors; we
+ * cyclically repeat the bytes up to this length to stabilise detection.
+ */
+ private static final int MIN_BYTES_FOR_DETECTING_CHARSET = 100;
+
/**
* Maximum number of entries to record in integrity check metadata fields.
* Prevents excessive metadata in ZIPs with many discrepancies.
@@ -560,8 +568,22 @@ public class ZipParser extends AbstractArchiveParser {
// If charset detection is enabled, try to detect and decode
if (config.isDetectCharsetsInEntryNames()) {
byte[] entryName = entry.getRawName();
+ // Extend short entry names before detection: statistical detectors
+ // (e.g. UniversalEncodingDetector, Icu4j) need enough material to
+ // make a confident call. Cyclically repeat the bytes so the
+ // detector still sees the same byte distribution.
+ byte[] extendedEntryName = entryName;
+ if (entryName != null && 0 < entryName.length
+ && entryName.length < MIN_BYTES_FOR_DETECTING_CHARSET) {
+ int len = entryName.length
+ * (MIN_BYTES_FOR_DETECTING_CHARSET / entryName.length);
+ extendedEntryName = new byte[len];
+ for (int i = 0; i < len; i++) {
+ extendedEntryName[i] = entryName[i % entryName.length];
+ }
+ }
- try (TikaInputStream detectStream =
TikaInputStream.get(entryName)) {
+ try (TikaInputStream detectStream =
TikaInputStream.get(extendedEntryName)) {
List<EncodingResult> encResults =
getEncodingDetector().detect(detectStream,
parentMetadata, context);
Charset candidate = encResults.isEmpty() ? null :
encResults.get(0).getCharset();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index a32d063223..1d319e8606 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -101,7 +101,7 @@ public class TextAndCSVParserTest extends TikaTest {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER,
metadata);
assertEquals("comma",
xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
- assertMediaTypeEquals("csv", "windows-1252", "comma",
+ assertMediaTypeEquals("csv", "ISO-8859-1", "comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS));
@@ -126,7 +126,7 @@ public class TextAndCSVParserTest extends TikaTest {
metadata.set(Metadata.CONTENT_TYPE, "text/csv");
XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER,
metadata);
assertEquals("comma",
xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
- assertMediaTypeEquals("csv", "windows-1252", "comma",
+ assertMediaTypeEquals("csv", "ISO-8859-1", "comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@@ -160,7 +160,7 @@ public class TextAndCSVParserTest extends TikaTest {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(TikaInputStream.get(TSV_UTF8), PARSER,
metadata);
assertEquals("tab",
xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
- assertMediaTypeEquals("tsv", "windows-1252", "tab",
+ assertMediaTypeEquals("tsv", "ISO-8859-1", "tab",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
}
@@ -191,7 +191,7 @@ public class TextAndCSVParserTest extends TikaTest {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(TikaInputStream.get(csv), PARSER,
metadata);
assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
- assertEquals("text/plain; charset=windows-1252",
+ assertEquals("text/plain; charset=ISO-8859-1",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContains("the,quick", xmlResult.xml);
}
@@ -225,7 +225,7 @@ public class TextAndCSVParserTest extends TikaTest {
XMLResult xmlResult =
getXML(TikaInputStream.get(sb.toString().getBytes(StandardCharsets.UTF_8)),
PARSER, metadata);
- assertMediaTypeEquals("csv", "windows-1252", "comma",
+ assertMediaTypeEquals("csv", "ISO-8859-1", "comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
}
@@ -233,7 +233,7 @@ public class TextAndCSVParserTest extends TikaTest {
@Test
public void testSubclassingMimeTypesRemain() throws Exception {
XMLResult r = getXML("testVCalendar.vcs");
- assertEquals("text/x-vcalendar; charset=windows-1252",
r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/x-vcalendar; charset=ISO-8859-1",
r.metadata.get(Metadata.CONTENT_TYPE));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index e37f42eed9..128f653212 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -19,9 +19,7 @@ package org.apache.tika.parser.txt;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.StringWriter;
@@ -30,11 +28,9 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -57,8 +53,7 @@ public class TXTParserTest extends TikaTest {
}
String content = writer.toString();
- // Pure ASCII — detected as windows-1252 (the HTML5/WHATWG default for
8-bit Western)
- assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
// TIKA-501: Remove language detection from TXTParser
assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
@@ -92,7 +87,7 @@ public class TXTParserTest extends TikaTest {
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
parser.parse(tis, handler, metadata, new ParseContext());
}
- assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=UTF-8",
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
@@ -105,9 +100,6 @@ public class TXTParserTest extends TikaTest {
*/
@Test
public void testLatinDetectionHeuristics() throws Exception {
- // Previously tested CR/LF heuristics specific to
UniversalEncodingDetector.
- // The ML-based detector defaults to windows-1252 for pure ASCII
regardless of
- // line endings (CRLF_TO_WINDOWS is a secondary confirmation, not the
primary path).
String windows = "test\r\n";
String unix = "test\n";
String euro = "test \u20ac\n";
@@ -119,19 +111,20 @@ public class TXTParserTest extends TikaTest {
parser.parse(tis, new DefaultHandler(), metadata, new
ParseContext());
}
assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UniversalEncodingDetector",
metadata.get(TikaCoreProperties.ENCODING_DETECTOR));
+ assertEquals("windows-1252",
metadata.get(TikaCoreProperties.DETECTED_ENCODING));
metadata = new Metadata();
try (TikaInputStream tis =
TikaInputStream.get(unix.getBytes("ISO-8859-15"))) {
parser.parse(tis, new DefaultHandler(), metadata, new
ParseContext());
}
- assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
metadata = new Metadata();
try (TikaInputStream tis =
TikaInputStream.get(euro.getBytes("ISO-8859-15"))) {
parser.parse(tis, new DefaultHandler(), metadata, new
ParseContext());
}
- // 7 bytes with one high byte (0xA4) — just verify detection succeeds
- assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-15",
metadata.get(Metadata.CONTENT_TYPE));
}
/**
@@ -156,18 +149,23 @@ public class TXTParserTest extends TikaTest {
*/
@Test
public void testUseIncomingCharsetAsHint() throws Exception {
- // u00e1 is latin small letter a with acute — 17 bytes, one high byte
(0xE1).
- // The ML detector returns a Windows Latin variant; incoming charset
hints are
- // not used to override detection in the new pipeline.
+ // Could be ISO 8859-1 or ISO 8859-15 or ...
+ // u00e1 is latin small letter a with acute
final String test2 = "the name is \u00e1ndre";
Metadata metadata = new Metadata();
try (TikaInputStream tis =
TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- // Short probe with one high byte — detector returns a Windows Latin
variant
- assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
- assertNotNull(metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
// deprecated
+
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
+ try (TikaInputStream tis =
TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
+ parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
+ }
+ assertEquals("text/plain; charset=ISO-8859-15",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
// deprecated
}
/**
@@ -177,16 +175,26 @@ public class TXTParserTest extends TikaTest {
*/
@Test
public void testUsingCharsetInContentTypeHeader() throws Exception {
- // u00e1 is latin small letter a with acute — 17 bytes, one high byte
(0xE1).
- // Incoming charset in content-type is not used to override ML
detection.
+ // Could be ISO 8859-1 or ISO 8859-15 or ...
+ // u00e1 is latin small letter a with acute
final String test2 = "the name is \u00e1ndre";
Metadata metadata = new Metadata();
try (TikaInputStream tis =
TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- assertNotNull(metadata.get(Metadata.CONTENT_TYPE));
- assertNotNull(metadata.get(Metadata.CONTENT_ENCODING));
+ parser.parse(TikaInputStream.get(test2.getBytes(ISO_8859_1)), new
BodyContentHandler(),
+ metadata, new ParseContext());
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
// deprecated
+
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
+ try (TikaInputStream tis =
TikaInputStream.get(test2.getBytes(ISO_8859_1))) {
+ parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
+ }
+ assertEquals("text/html; charset=ISO-8859-15",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
// deprecated
}
private void assertExtractText(String msg, String expected, byte[] input)
throws Exception {
@@ -238,11 +246,7 @@ public class TXTParserTest extends TikaTest {
parser.parse(getResourceAsStream("/test-documents/english.cp500.txt"),
new WriteOutContentHandler(writer), metadata, new
ParseContext());
- // IBM500 and IBM1047 share 247 of 256 byte mappings and are
indistinguishable
- // for normal Latin text — accept either.
- String ct = metadata.get(Metadata.CONTENT_TYPE);
- assertTrue(ct.equals("text/plain; charset=IBM500") ||
ct.equals("text/plain; charset=IBM1047"),
- "Expected IBM500 or IBM1047, got: " + ct);
+ assertEquals("text/plain; charset=IBM500",
metadata.get(Metadata.CONTENT_TYPE));
// Additional check that it isn't too eager on short blocks of text
metadata = new Metadata();
@@ -252,7 +256,7 @@ public class TXTParserTest extends TikaTest {
parser.parse(tis, new WriteOutContentHandler(writer), metadata,
new ParseContext());
}
- assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
}
/**
@@ -268,10 +272,10 @@ public class TXTParserTest extends TikaTest {
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
}
- assertEquals("text/plain; charset=windows-1252",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1",
metadata.get(Metadata.CONTENT_TYPE));
- // TIKA-868: MetadataCharsetDetector (tika-core) reads the charset
from Content-Type
- // and returns it as DECLARATIVE, which CharSoup prefers over the
statistical windows-1252.
+ // Now verify that if we tell the parser the encoding is UTF-8, that's
what
+ // we get back (see TIKA-868)
metadata.set(Metadata.CONTENT_TYPE, "application/binary;
charset=UTF-8");
try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) {
parser.parse(tis, new BodyContentHandler(), metadata, new
ParseContext());
@@ -283,23 +287,7 @@ public class TXTParserTest extends TikaTest {
@Test
public void testSubclassingMimeTypesRemain() throws Exception {
XMLResult r = getXML("testVCalendar.vcs");
- assertEquals("text/x-vcalendar; charset=windows-1252",
r.metadata.get(Metadata.CONTENT_TYPE));
- }
-
- // TIKA-3516, TIKA-3525, TIKA-1236
- @Test
- public void testIgnoreCharset() throws Exception {
- AutoDetectParser parser = (AutoDetectParser) TikaLoader.load(
- getConfigPath(TXTParserTest.class,
"tika-config-ignore-charset.json"))
- .loadAutoDetectParser();
-
- Metadata m = new Metadata();
- m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
- assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt", parser,
m).xml);
-
- m = new Metadata();
- m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
- assertContains("Please check your email",
getXML("test_ignore_IBM420.html", parser, m).xml);
+ assertEquals("text/x-vcalendar; charset=ISO-8859-1",
r.metadata.get(Metadata.CONTENT_TYPE));
}
}