This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 127665a5d8 TIKA-4751 - decode as (#2867)
127665a5d8 is described below

commit 127665a5d820e88302ec2c2c017ff4e09e427027
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 08:49:18 2026 -0400

    TIKA-4751 - decode as (#2867)
---
 .../java/org/apache/tika/detect/AutoDetectReader.java    | 11 +++++------
 .../java/org/apache/tika/detect/CharsetSupersets.java    | 11 +++++++++++
 .../main/java/org/apache/tika/detect/EncodingResult.java | 14 ++++++++++++++
 .../java/org/apache/tika/parser/html/JSoupParser.java    |  4 +++-
 .../apache/tika/parser/microsoft/OutlookExtractor.java   | 16 ++++++++++++----
 .../main/java/org/apache/tika/parser/dbf/DBFParser.java  |  9 ++++++---
 .../main/java/org/apache/tika/parser/pkg/ZipParser.java  |  2 +-
 7 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index f17844bf78..0bea81f32d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -100,13 +100,12 @@ public class AutoDetectReader extends BufferedReader {
         // Ask all given detectors for the character encoding
         List<EncodingResult> results = detector.detect(tis, metadata, new ParseContext());
         if (!results.isEmpty()) {
-            Charset detected = results.get(0).getCharset();
-            Charset superset = CharsetSupersets.supersetOf(detected);
-            if (superset != null) {
-                metadata.set(TikaCoreProperties.DECODED_CHARSET, superset.name());
-                return superset;
+            EncodingResult result = results.get(0);
+            Charset decodeAs = result.getDecodeAs();
+            if (!decodeAs.equals(result.getCharset())) {
+                metadata.set(TikaCoreProperties.DECODED_CHARSET, decodeAs.name());
             }
-            return detected;
+            return decodeAs;
         }
 
         // Try determining the encoding based on hints in document metadata.
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
index 88bd5416bb..c9935feb3a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CharsetSupersets.java
@@ -88,4 +88,15 @@ public final class CharsetSupersets {
             return null;
         }
     }
+
+    /**
+     * The charset to decode with: {@link #supersetOf(Charset) superset} of
+     * {@code detected} if one exists, else {@code detected} (null only if
+     * {@code detected} is null). For bare-{@link Charset} callers; detection
+     * results can use {@link EncodingResult#getDecodeAs()}.
+     */
+    public static Charset decodeAs(Charset detected) {
+        Charset superset = supersetOf(detected);
+        return superset != null ? superset : detected;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
index 55724aefc7..26df7d3b50 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
@@ -126,10 +126,24 @@ public class EncodingResult {
         this.resultType = resultType;
     }
 
+    /**
+     * The detected charset. For <em>decoding</em> bytes prefer {@link #getDecodeAs()};
+     * this is the charset to <em>report</em> (Content-Type / detected-encoding).
+     */
     public Charset getCharset() {
         return charset;
     }
 
+    /**
+     * The charset to decode with: {@link #getCharset()} widened to its superset when
+     * one exists (e.g. GBK &rarr; GB18030), else the detected charset unchanged.
+     *
+     * @see CharsetSupersets
+     */
+    public Charset getDecodeAs() {
+        return CharsetSupersets.decodeAs(charset);
+    }
+
     /**
      * Detection confidence in {@code [0.0, 1.0]}.  Meaningful for ranking
      * among {@link ResultType#STATISTICAL} candidates.  For
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 4e45a8723e..05d8f836e6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -163,6 +163,8 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
         List<EncodingResult> encResults = encodingDetector.detect(tis, metadata, context);
         Charset charset = encResults.isEmpty() ? DEFAULT_CHARSET
                 : encResults.get(0).getCharset();
+        Charset decodeAs = encResults.isEmpty() ? DEFAULT_CHARSET
+                : encResults.get(0).getDecodeAs();
         String previous = metadata.get(Metadata.CONTENT_TYPE);
         MediaType contentType = null;
         if (previous == null || previous.startsWith("text/html")) {
@@ -195,7 +197,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
         tis.setCloseShield();
         Document document;
         try {
-            document = Jsoup.parse(tis, charset.name(), "",
+            document = Jsoup.parse(tis, decodeAs.name(), "",
                     Parser.htmlParser().tagSet(tagSet));
         } finally {
             tis.removeCloseShield();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 01b357bddf..c6f4bc8db6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -68,6 +68,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
+import org.apache.tika.detect.CharsetSupersets;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -903,7 +904,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 try (TikaInputStream tis = TikaInputStream.get(html.getBytes(UTF_8))) {
                     List<EncodingResult> encResults =
                             detector.detect(tis, EMPTY_METADATA, context);
-                    charset = encResults.isEmpty() ? null : encResults.get(0).getCharset();
+                    charset = encResults.isEmpty() ? null : encResults.get(0).getDecodeAs();
                 } catch (IOException e) {
                     //swallow
                 }
@@ -921,9 +922,16 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
             CharsetDetector detector = new CharsetDetector();
             detector.setText(text.getRawValue());
             CharsetMatch match = detector.detect();
-            if (match != null && match.getConfidence() > 35 &&
-                    tryToSet7BitEncoding(msg, match.getName())) {
-                return;
+            if (match != null && match.getConfidence() > 35) {
+                String charsetName = match.getName();
+                try {
+                    charsetName = CharsetSupersets.decodeAs(Charset.forName(charsetName)).name();
+                } catch (IllegalArgumentException e) {
+                    //ICU name not a resolvable Java charset; use as-is
+                }
+                if (tryToSet7BitEncoding(msg, charsetName)) {
+                    return;
+                }
             }
         }
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
index d6e9d8d863..f894593be4 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
@@ -30,6 +30,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaComponent;
+import org.apache.tika.detect.CharsetSupersets;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.exception.TikaException;
@@ -89,6 +90,8 @@ public class DBFParser implements Parser {
 
         Charset charset = getCharset(firstRows, header, context);
         metadata.set(Metadata.CONTENT_ENCODING, charset.toString());
+        //report detected (above); decode with its superset
+        Charset decodeAs = CharsetSupersets.decodeAs(charset);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
         xhtml.startDocument();
@@ -96,7 +99,7 @@ public class DBFParser implements Parser {
         xhtml.startElement("thead");
         for (DBFColumnHeader col : header.getCols()) {
             xhtml.startElement("th");
-            xhtml.characters(col.getName(charset));
+            xhtml.characters(col.getName(decodeAs));
             xhtml.endElement("th");
         }
         xhtml.endElement("thead");
@@ -106,12 +109,12 @@ public class DBFParser implements Parser {
         //now write cached rows
         while (firstRows.size() > 0) {
             DBFRow cachedRow = firstRows.remove(0);
-            writeRow(cachedRow, charset, xhtml);
+            writeRow(cachedRow, decodeAs, xhtml);
         }
 
         //now continue with rest
         while (row != null) {
-            writeRow(row, charset, xhtml);
+            writeRow(row, decodeAs, xhtml);
             row = reader.next();
         }
         xhtml.endElement("tbody");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 2933200bea..d01fa3ab41 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -565,7 +565,7 @@ public class ZipParser extends AbstractArchiveParser {
             try (TikaInputStream detectStream = TikaInputStream.get(entryName)) {
                 List<EncodingResult> encResults =
                         getEncodingDetector().detect(detectStream, parentMetadata, context);
-                Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getCharset();
+                Charset candidate = encResults.isEmpty() ? null : encResults.get(0).getDecodeAs();
                 if (candidate != null) {
                     return new String(entry.getRawName(), candidate);
                 }

Reply via email to