This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 88a6fc5234 TIKA-4752 -- improve zip name detection (#2869)
88a6fc5234 is described below
commit 88a6fc52346b97a8b06fad2b430a13c8d8420ffb
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 10:19:34 2026 -0400
TIKA-4752 -- improve zip name detection (#2869)
---
.../tika/detect/MetadataCharsetDetector.java | 27 +++++-
.../tika/detect/MetadataCharsetDetectorTest.java | 101 +++++++++++++++++++
.../java/org/apache/tika/parser/pkg/ZipParser.java | 26 ++++-
.../tika/parser/pkg/ZipEntryNameEncodingTest.java | 107 +++++++++++++++++++++
4 files changed, 252 insertions(+), 9 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
index 13102ea01d..1581bc9b74 100644
---
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
+++
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -33,10 +34,13 @@ import org.apache.tika.parser.ParseContext;
* reading any bytes from the stream. Returns a single
* {@link EncodingResult.ResultType#DECLARATIVE} result when a charset is
found.
*
- * <p>Two metadata keys are consulted in order:
+ * <p>Three metadata keys are consulted in order:
* <ol>
* <li>{@link Metadata#CONTENT_TYPE} — the {@code charset} parameter of the
* HTTP/MIME Content-Type header (e.g. {@code text/html;
charset=UTF-8}).</li>
+ * <li>{@link TikaCoreProperties#CONTENT_TYPE_HINT} — the {@code charset}
parameter
+ * of a content-type a source <em>claimed</em> for the bytes (e.g. an
HTML
+ * {@code <meta>} tag, or a zip entry's UTF-8 (EFS) flag). A hint, not a
verdict.</li>
* <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by
parsers
* such as {@code RFC822Parser}, which splits Content-Type into a bare
* media-type key and a separate charset key.</li>
@@ -56,6 +60,9 @@ public class MetadataCharsetDetector implements
EncodingDetector {
public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
ParseContext context) throws
IOException {
Charset cs = charsetFromContentType(metadata);
+ if (cs == null) {
+ cs = charsetFromContentTypeHint(metadata);
+ }
if (cs == null) {
cs = charsetFromContentEncoding(metadata);
}
@@ -71,7 +78,20 @@ public class MetadataCharsetDetector implements
EncodingDetector {
* {@link Metadata#CONTENT_TYPE} value, or {@code null} if absent or
unparseable.
*/
public static Charset charsetFromContentType(Metadata metadata) {
- String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ return charsetFromMediaType(metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ /**
+ * Returns the charset named in the {@code charset} parameter of the
+ * {@link TikaCoreProperties#CONTENT_TYPE_HINT} value — a content-type a
source
+ * claimed for the bytes (HTML {@code <meta>}, a zip entry's UTF-8 flag,
...) —
+ * or {@code null} if absent or unparseable.
+ */
+ public static Charset charsetFromContentTypeHint(Metadata metadata) {
+ return
charsetFromMediaType(metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+ }
+
+ private static Charset charsetFromMediaType(String contentType) {
if (contentType == null) {
return null;
}
@@ -79,8 +99,7 @@ public class MetadataCharsetDetector implements
EncodingDetector {
if (mediaType == null) {
return null;
}
- String label = mediaType.getParameters().get("charset");
- return parseCharset(label);
+ return parseCharset(mediaType.getParameters().get("charset"));
}
/**
diff --git
a/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
new file mode 100644
index 0000000000..b8ca8531b9
--- /dev/null
+++
b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+public class MetadataCharsetDetectorTest {
+
+ private final MetadataCharsetDetector detector = new
MetadataCharsetDetector();
+
+ private Charset detect(Metadata metadata) throws IOException {
+ try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+ List<EncodingResult> results = detector.detect(tis, metadata, new
ParseContext());
+ if (results.isEmpty()) {
+ return null;
+ }
+ assertEquals(EncodingResult.ResultType.DECLARATIVE,
results.get(0).getResultType());
+ return results.get(0).getCharset();
+ }
+ }
+
+ @Test
+ public void testContentTypeHint() throws Exception {
+ // TIKA-4752: the charset claimed via CONTENT_TYPE_HINT (e.g. a zip
entry's
+ // UTF-8/EFS flag, recorded as text/plain; charset=UTF-8) is consumed.
+ Metadata m = new Metadata();
+ m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain;
charset=UTF-8");
+ assertEquals(StandardCharsets.UTF_8, detect(m));
+ }
+
+ @Test
+ public void testContentType() throws Exception {
+ Metadata m = new Metadata();
+ // ISO-8859-1 normalizes to its windows-1252 superset (WHATWG),
existing behavior.
+ m.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ assertEquals(Charset.forName("windows-1252"), detect(m));
+ }
+
+ @Test
+ public void testContentEncoding() throws Exception {
+ Metadata m = new Metadata();
+ m.set(Metadata.CONTENT_ENCODING, "Shift_JIS");
+ assertEquals(Charset.forName("Shift_JIS"), detect(m));
+ }
+
+ @Test
+ public void testContentTypeWinsOverHint() throws Exception {
+ Metadata m = new Metadata();
+ m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-16");
+ m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain;
charset=UTF-8");
+ assertEquals(StandardCharsets.UTF_16, detect(m));
+ }
+
+ @Test
+ public void testHintWinsOverContentEncoding() throws Exception {
+ Metadata m = new Metadata();
+ m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain;
charset=UTF-8");
+ m.set(Metadata.CONTENT_ENCODING, "Shift_JIS");
+ assertEquals(StandardCharsets.UTF_8, detect(m));
+ }
+
+ @Test
+ public void testNoDeclarationIsEmpty() throws Exception {
+ assertEquals(null, detect(new Metadata()));
+ // A content-type with no charset parameter is not a declaration.
+ Metadata m = new Metadata();
+ m.set(Metadata.CONTENT_TYPE, "text/plain");
+ assertEquals(null, detect(m));
+ // An unparseable charset label is ignored, not thrown.
+ Metadata bad = new Metadata();
+ bad.set(Metadata.CONTENT_ENCODING, "not-a-charset");
+ assertTrue(detect(bad) == null);
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index d01fa3ab41..fe9b5236d6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -22,6 +22,7 @@ import static org.apache.tika.detect.zip.PackageConstants.ZIP;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.nio.file.attribute.FileTime;
import java.util.ArrayList;
import java.util.Collections;
@@ -468,7 +469,7 @@ public class ZipParser extends AbstractArchiveParser {
ZipParserConfig config)
throws SAXException, IOException, TikaException {
- String name = detectEntryName(entry, parentMetadata, context, config);
+ String name = detectEntryName(entry, context, config);
if (entry.getGeneralPurposeBit().usesEncryption()) {
handleEncryptedEntry(name, parentMetadata, xhtml);
@@ -513,7 +514,7 @@ public class ZipParser extends AbstractArchiveParser {
ZipParserConfig config)
throws SAXException, IOException, TikaException {
- String name = detectEntryName(entry, parentMetadata, context, config);
+ String name = detectEntryName(entry, context, config);
if (!zis.canReadEntryData(entry)) {
if (entry.getGeneralPurposeBit().usesEncryption()) {
@@ -549,22 +550,37 @@ public class ZipParser extends AbstractArchiveParser {
}
}
- private String detectEntryName(ZipArchiveEntry entry, Metadata
parentMetadata,
- ParseContext context, ZipParserConfig
config) throws IOException {
+ private String detectEntryName(ZipArchiveEntry entry, ParseContext context,
+ ZipParserConfig config) throws IOException
{
// If user specified an encoding, decode raw bytes with that charset
// This avoids needing to reopen the ZipFile with a different charset
if (config.getEntryEncoding() != null) {
return new String(entry.getRawName(), config.getEntryEncoding());
}
+ // A zip only ever declares a name as UTF-8 (it can't name a legacy
charset),
+ // in two ways. The Unicode extra field carries a CRC-validated UTF-8
name -- that
+ // CRC check is the evaluation, so trust commons-compress's getName().
+ if (entry.getNameSource() ==
ZipArchiveEntry.NameSource.UNICODE_EXTRA_FIELD) {
+ return entry.getName();
+ }
+
// If charset detection is enabled, try to detect and decode.
// Mojibuster handles short inputs natively (zip filenames are often
// 9-30 bytes); no byte-extension trick needed.
if (config.isDetectCharsetsInEntryNames()) {
byte[] entryName = entry.getRawName();
+ // The EFS flag (general purpose bit 11) also declares UTF-8, but
is
+ // unvalidated. Record it as a content-type hint for the detector
to
+ // evaluate against the bytes, not trust outright.
+ Metadata nameMetadata = new Metadata();
+ if (entry.getNameSource() ==
ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) {
+ nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
+ new MediaType(MediaType.TEXT_PLAIN,
StandardCharsets.UTF_8).toString());
+ }
try (TikaInputStream detectStream =
TikaInputStream.get(entryName)) {
List<EncodingResult> encResults =
- getEncodingDetector().detect(detectStream,
parentMetadata, context);
+ getEncodingDetector(context).detect(detectStream,
nameMetadata, context);
Charset candidate = encResults.isEmpty() ? null :
encResults.get(0).getDecodeAs();
if (candidate != null) {
return new String(entry.getRawName(), candidate);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
new file mode 100644
index 0000000000..a8fe4e58be
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.CompositeEncodingDetector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.MetadataCharsetDetector;
+import org.apache.tika.detect.OverrideEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * TIKA-4752: a zip can only declare an entry name as UTF-8 (never a legacy
charset),
+ * in two ways -- the EFS flag (general purpose bit 11) and the Unicode path
extra field.
+ * ZipParser must honor both.
+ */
+public class ZipEntryNameEncodingTest extends TikaTest {
+
+ private static final String LATIN = "café-Köln-Süß.txt";
+ private static final String CJK = "日本語.txt";
+
+ @Test
+ public void testEfsFlagHint() throws Exception {
+ // Deterministic + discriminating: MetadataCharsetDetector consumes the
+ // EFS->UTF-8 hint; the override garbles anything it doesn't catch. So
only the
+ // hint yields UTF-8 -- an empty-returning detector wouldn't isolate
it, because
+ // ZipParser would fall back to getName(), already UTF-8 for a flagged
entry.
+ ParseContext context = new ParseContext();
+ context.set(EncodingDetector.class, new
CompositeEncodingDetector(List.of(
+ new MetadataCharsetDetector(),
+ new
OverrideEncodingDetector(Charset.forName("windows-1252")))));
+ assertEquals(LATIN, entryName(efsZip(LATIN), context));
+ }
+
+ @Test
+ public void testUnicodeExtraField() throws Exception {
+ // CRC-validated UTF-8 name in the extra field; the main-header name
is a garbled
+ // CP437 fallback. We must use the extra-field name, not detect the
raw bytes.
+ assertEquals(CJK, entryName(unicodeExtraFieldZip(CJK), new
ParseContext()));
+ }
+
+ private String entryName(byte[] zipBytes, ParseContext context) throws
Exception {
+ try (TikaInputStream tis = TikaInputStream.get(zipBytes)) {
+ List<Metadata> list = getRecursiveMetadata(tis, new Metadata(),
context, false);
+ assertEquals(2, list.size());
+ return list.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ }
+ }
+
+ private static byte[] efsZip(String name) throws IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) {
+ zos.setEncoding("UTF-8");
+ zos.setUseLanguageEncodingFlag(true);
+
zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER);
+ writeEntry(zos, name);
+ }
+ return bos.toByteArray();
+ }
+
+ private static byte[] unicodeExtraFieldZip(String name) throws IOException
{
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) {
+ zos.setEncoding("Cp437");
+ zos.setUseLanguageEncodingFlag(false);
+
zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.ALWAYS);
+ writeEntry(zos, name);
+ }
+ return bos.toByteArray();
+ }
+
+ private static void writeEntry(ZipArchiveOutputStream zos, String name)
throws IOException {
+ ZipArchiveEntry entry = new ZipArchiveEntry(name);
+ zos.putArchiveEntry(entry);
+ zos.write("hello".getBytes(StandardCharsets.US_ASCII));
+ zos.closeArchiveEntry();
+ }
+}