(tika) branch main updated: TIKA-4752 -- improve zip name detection (#2869)

tallison Fri, 05 Jun 2026 07:19:50 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 88a6fc5234 TIKA-4752 -- improve zip name detection (#2869)
88a6fc5234 is described below

commit 88a6fc52346b97a8b06fad2b430a13c8d8420ffb
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 10:19:34 2026 -0400

    TIKA-4752 -- improve zip name detection (#2869)
---
 .../tika/detect/MetadataCharsetDetector.java       |  27 +++++-
 .../tika/detect/MetadataCharsetDetectorTest.java   | 101 +++++++++++++++++++
 .../java/org/apache/tika/parser/pkg/ZipParser.java |  26 ++++-
 .../tika/parser/pkg/ZipEntryNameEncodingTest.java  | 107 +++++++++++++++++++++
 4 files changed, 252 insertions(+), 9 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java 
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
index 13102ea01d..1581bc9b74 100644
--- 
a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
+++ 
b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
@@ -25,6 +25,7 @@ import java.util.List;
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 
@@ -33,10 +34,13 @@ import org.apache.tika.parser.ParseContext;
  * reading any bytes from the stream.  Returns a single
  * {@link EncodingResult.ResultType#DECLARATIVE} result when a charset is 
found.
  *
- * <p>Two metadata keys are consulted in order:
+ * <p>Three metadata keys are consulted in order:
  * <ol>
  *   <li>{@link Metadata#CONTENT_TYPE} — the {@code charset} parameter of the
  *       HTTP/MIME Content-Type header (e.g. {@code text/html; 
charset=UTF-8}).</li>
+ *   <li>{@link TikaCoreProperties#CONTENT_TYPE_HINT} — the {@code charset} 
parameter
+ *       of a content-type a source <em>claimed</em> for the bytes (e.g. an 
HTML
+ *       {@code <meta>} tag, or a zip entry's UTF-8 (EFS) flag). A hint, not a 
verdict.</li>
  *   <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by 
parsers
  *       such as {@code RFC822Parser}, which splits Content-Type into a bare
  *       media-type key and a separate charset key.</li>
@@ -56,6 +60,9 @@ public class MetadataCharsetDetector implements 
EncodingDetector {
     public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata,
                                        ParseContext context) throws 
IOException {
         Charset cs = charsetFromContentType(metadata);
+        if (cs == null) {
+            cs = charsetFromContentTypeHint(metadata);
+        }
         if (cs == null) {
             cs = charsetFromContentEncoding(metadata);
         }
@@ -71,7 +78,20 @@ public class MetadataCharsetDetector implements 
EncodingDetector {
      * {@link Metadata#CONTENT_TYPE} value, or {@code null} if absent or 
unparseable.
      */
     public static Charset charsetFromContentType(Metadata metadata) {
-        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        return charsetFromMediaType(metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Returns the charset named in the {@code charset} parameter of the
+     * {@link TikaCoreProperties#CONTENT_TYPE_HINT} value — a content-type a 
source
+     * claimed for the bytes (HTML {@code <meta>}, a zip entry's UTF-8 flag, 
...) —
+     * or {@code null} if absent or unparseable.
+     */
+    public static Charset charsetFromContentTypeHint(Metadata metadata) {
+        return 
charsetFromMediaType(metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+    }
+
+    private static Charset charsetFromMediaType(String contentType) {
         if (contentType == null) {
             return null;
         }
@@ -79,8 +99,7 @@ public class MetadataCharsetDetector implements 
EncodingDetector {
         if (mediaType == null) {
             return null;
         }
-        String label = mediaType.getParameters().get("charset");
-        return parseCharset(label);
+        return parseCharset(mediaType.getParameters().get("charset"));
     }
 
     /**
diff --git 
a/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
 
b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
new file mode 100644
index 0000000000..b8ca8531b9
--- /dev/null
+++ 
b/tika-core/src/test/java/org/apache/tika/detect/MetadataCharsetDetectorTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+public class MetadataCharsetDetectorTest {
+
+    private final MetadataCharsetDetector detector = new 
MetadataCharsetDetector();
+
+    private Charset detect(Metadata metadata) throws IOException {
+        try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+            List<EncodingResult> results = detector.detect(tis, metadata, new 
ParseContext());
+            if (results.isEmpty()) {
+                return null;
+            }
+            assertEquals(EncodingResult.ResultType.DECLARATIVE, 
results.get(0).getResultType());
+            return results.get(0).getCharset();
+        }
+    }
+
+    @Test
+    public void testContentTypeHint() throws Exception {
+        // TIKA-4752: the charset claimed via CONTENT_TYPE_HINT (e.g. a zip 
entry's
+        // UTF-8/EFS flag, recorded as text/plain; charset=UTF-8) is consumed.
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; 
charset=UTF-8");
+        assertEquals(StandardCharsets.UTF_8, detect(m));
+    }
+
+    @Test
+    public void testContentType() throws Exception {
+        Metadata m = new Metadata();
+        // ISO-8859-1 normalizes to its windows-1252 superset (WHATWG), 
existing behavior.
+        m.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+        assertEquals(Charset.forName("windows-1252"), detect(m));
+    }
+
+    @Test
+    public void testContentEncoding() throws Exception {
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_ENCODING, "Shift_JIS");
+        assertEquals(Charset.forName("Shift_JIS"), detect(m));
+    }
+
+    @Test
+    public void testContentTypeWinsOverHint() throws Exception {
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-16");
+        m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; 
charset=UTF-8");
+        assertEquals(StandardCharsets.UTF_16, detect(m));
+    }
+
+    @Test
+    public void testHintWinsOverContentEncoding() throws Exception {
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.CONTENT_TYPE_HINT, "text/plain; 
charset=UTF-8");
+        m.set(Metadata.CONTENT_ENCODING, "Shift_JIS");
+        assertEquals(StandardCharsets.UTF_8, detect(m));
+    }
+
+    @Test
+    public void testNoDeclarationIsEmpty() throws Exception {
+        assertEquals(null, detect(new Metadata()));
+        // A content-type with no charset parameter is not a declaration.
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_TYPE, "text/plain");
+        assertEquals(null, detect(m));
+        // An unparseable charset label is ignored, not thrown.
+        Metadata bad = new Metadata();
+        bad.set(Metadata.CONTENT_ENCODING, "not-a-charset");
+        assertTrue(detect(bad) == null);
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index d01fa3ab41..fe9b5236d6 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -22,6 +22,7 @@ import static org.apache.tika.detect.zip.PackageConstants.ZIP;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.attribute.FileTime;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -468,7 +469,7 @@ public class ZipParser extends AbstractArchiveParser {
                                     ZipParserConfig config)
             throws SAXException, IOException, TikaException {
 
-        String name = detectEntryName(entry, parentMetadata, context, config);
+        String name = detectEntryName(entry, context, config);
 
         if (entry.getGeneralPurposeBit().usesEncryption()) {
             handleEncryptedEntry(name, parentMetadata, xhtml);
@@ -513,7 +514,7 @@ public class ZipParser extends AbstractArchiveParser {
                                    ZipParserConfig config)
             throws SAXException, IOException, TikaException {
 
-        String name = detectEntryName(entry, parentMetadata, context, config);
+        String name = detectEntryName(entry, context, config);
 
         if (!zis.canReadEntryData(entry)) {
             if (entry.getGeneralPurposeBit().usesEncryption()) {
@@ -549,22 +550,37 @@ public class ZipParser extends AbstractArchiveParser {
         }
     }
 
-    private String detectEntryName(ZipArchiveEntry entry, Metadata 
parentMetadata,
-                                    ParseContext context, ZipParserConfig 
config) throws IOException {
+    private String detectEntryName(ZipArchiveEntry entry, ParseContext context,
+                                    ZipParserConfig config) throws IOException 
{
         // If user specified an encoding, decode raw bytes with that charset
         // This avoids needing to reopen the ZipFile with a different charset
         if (config.getEntryEncoding() != null) {
             return new String(entry.getRawName(), config.getEntryEncoding());
         }
 
+        // A zip can declare an entry name only as UTF-8 (never a legacy charset), in
+        // one of two ways. First: the Unicode extra field carries a CRC-validated UTF-8
+        // name; that CRC check is the evaluation, so trust commons-compress's getName().
+        if (entry.getNameSource() == 
ZipArchiveEntry.NameSource.UNICODE_EXTRA_FIELD) {
+            return entry.getName();
+        }
+
         // If charset detection is enabled, try to detect and decode.
         // Mojibuster handles short inputs natively (zip filenames are often
         // 9-30 bytes); no byte-extension trick needed.
         if (config.isDetectCharsetsInEntryNames()) {
             byte[] entryName = entry.getRawName();
+            // The EFS flag (general purpose bit 11) also declares UTF-8, but 
is
+            // unvalidated. Record it as a content-type hint for the detector 
to
+            // evaluate against the bytes, not trust outright.
+            Metadata nameMetadata = new Metadata();
+            if (entry.getNameSource() == 
ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) {
+                nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
+                        new MediaType(MediaType.TEXT_PLAIN, 
StandardCharsets.UTF_8).toString());
+            }
             try (TikaInputStream detectStream = 
TikaInputStream.get(entryName)) {
                 List<EncodingResult> encResults =
-                        getEncodingDetector().detect(detectStream, 
parentMetadata, context);
+                        getEncodingDetector(context).detect(detectStream, 
nameMetadata, context);
                 Charset candidate = encResults.isEmpty() ? null : 
encResults.get(0).getDecodeAs();
                 if (candidate != null) {
                     return new String(entry.getRawName(), candidate);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
new file mode 100644
index 0000000000..a8fe4e58be
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipEntryNameEncodingTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.CompositeEncodingDetector;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.MetadataCharsetDetector;
+import org.apache.tika.detect.OverrideEncodingDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * TIKA-4752: a zip can declare an entry name only as UTF-8 (never a legacy charset),
+ * in one of two ways -- the EFS flag (general purpose bit 11) or the Unicode path
+ * extra field. ZipParser must honor both.
+ */
+public class ZipEntryNameEncodingTest extends TikaTest {
+
+    private static final String LATIN = "café-Köln-Süß.txt";
+    private static final String CJK = "日本語.txt";
+
+    @Test
+    public void testEfsFlagHint() throws Exception {
+        // Deterministic + discriminating: MetadataCharsetDetector consumes the
+        // EFS->UTF-8 hint; the override garbles anything it doesn't catch. So 
only the
+        // hint yields UTF-8 -- an empty-returning detector wouldn't isolate 
it, because
+        // ZipParser would fall back to getName(), already UTF-8 for a flagged 
entry.
+        ParseContext context = new ParseContext();
+        context.set(EncodingDetector.class, new 
CompositeEncodingDetector(List.of(
+                new MetadataCharsetDetector(),
+                new 
OverrideEncodingDetector(Charset.forName("windows-1252")))));
+        assertEquals(LATIN, entryName(efsZip(LATIN), context));
+    }
+
+    @Test
+    public void testUnicodeExtraField() throws Exception {
+        // CRC-validated UTF-8 name in the extra field; the main-header name 
is a garbled
+        // CP437 fallback. We must use the extra-field name, not detect the 
raw bytes.
+        assertEquals(CJK, entryName(unicodeExtraFieldZip(CJK), new 
ParseContext()));
+    }
+
+    private String entryName(byte[] zipBytes, ParseContext context) throws 
Exception {
+        try (TikaInputStream tis = TikaInputStream.get(zipBytes)) {
+            List<Metadata> list = getRecursiveMetadata(tis, new Metadata(), 
context, false);
+            assertEquals(2, list.size());
+            return list.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        }
+    }
+
+    private static byte[] efsZip(String name) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) {
+            zos.setEncoding("UTF-8");
+            zos.setUseLanguageEncodingFlag(true);
+            
zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER);
+            writeEntry(zos, name);
+        }
+        return bos.toByteArray();
+    }
+
+    private static byte[] unicodeExtraFieldZip(String name) throws IOException 
{
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(bos)) {
+            zos.setEncoding("Cp437");
+            zos.setUseLanguageEncodingFlag(false);
+            
zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.ALWAYS);
+            writeEntry(zos, name);
+        }
+        return bos.toByteArray();
+    }
+
+    private static void writeEntry(ZipArchiveOutputStream zos, String name) 
throws IOException {
+        ZipArchiveEntry entry = new ZipArchiveEntry(name);
+        zos.putArchiveEntry(entry);
+        zos.write("hello".getBytes(StandardCharsets.US_ASCII));
+        zos.closeArchiveEntry();
+    }
+}

(tika) branch main updated: TIKA-4752 -- improve zip name detection (#2869)

Reply via email to