This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4752 in repository https://gitbox.apache.org/repos/asf/tika.git
commit eff4f458bb73268ff6dd42f0067e51795e85a0c9 Merge: 652ffb27b2 ffd712980b Author: tallison <[email protected]> AuthorDate: Fri Jun 5 09:12:26 2026 -0400 merge main .../org/apache/tika/detect/AutoDetectReader.java | 11 +++-- .../org/apache/tika/detect/CharsetSupersets.java | 11 +++++ .../org/apache/tika/detect/EncodingResult.java | 14 +++++++ .../org/apache/tika/parser/html/JSoupParser.java | 4 +- .../tika/parser/microsoft/OutlookExtractor.java | 16 +++++-- .../java/org/apache/tika/parser/dbf/DBFParser.java | 9 ++-- .../java/org/apache/tika/parser/pkg/ZipParser.java | 2 +- .../tika/serialization/ComponentNameResolver.java | 49 ++++++++++++++++++++-- .../serialization/ComponentNameResolverTest.java | 45 ++++++++++++++++++++ 9 files changed, 142 insertions(+), 19 deletions(-) diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index cb0c52208a,d01fa3ab41..fe9b5236d6 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@@ -570,18 -562,10 +570,18 @@@ public class ZipParser extends Abstract // 9-30 bytes); no byte-extension trick needed. if (config.isDetectCharsetsInEntryNames()) { byte[] entryName = entry.getRawName(); + // The EFS flag (general purpose bit 11) also declares UTF-8, but is + // unvalidated. Record it as a content-type hint for the detector to + // evaluate against the bytes, not trust outright. + Metadata nameMetadata = new Metadata(); + if (entry.getNameSource() == ZipArchiveEntry.NameSource.NAME_WITH_EFS_FLAG) { + nameMetadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, + new MediaType(MediaType.TEXT_PLAIN, StandardCharsets.UTF_8).toString()); + } try (TikaInputStream detectStream = TikaInputStream.get(entryName)) { List<EncodingResult> encResults = - getEncodingDetector().detect(detectStream, parentMetadata, context); + getEncodingDetector(context).detect(detectStream, nameMetadata, context); - Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getCharset(); + Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getDecodeAs(); if (candidate != null) { return new String(entry.getRawName(), candidate); }
