(tika) 01/01: TIKA-4752 -- follow up

tallison Fri, 05 Jun 2026 06:55:33 -0700

This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch TIKA-4752-follow-up
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 723b2f96abe6d5dadf5d0fa7be786b337db1ec8a
Author: tallison <[email protected]>
AuthorDate: Fri Jun 5 09:55:03 2026 -0400

    TIKA-4752 -- follow up
---
 .../tika/parser/microsoft/OutlookExtractor.java    | 133 ++++++++++-----------
 .../java/org/apache/tika/parser/dbf/DBFParser.java |   8 +-
 2 files changed, 71 insertions(+), 70 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c6f4bc8db6..d17bd9edd4 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -68,7 +68,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
-import org.apache.tika.detect.CharsetSupersets;
+import org.apache.tika.detect.DefaultEncodingDetector;
+import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -81,14 +82,11 @@ import org.apache.tika.metadata.RTFMetadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
 import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
 import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator;
-import org.apache.tika.parser.txt.CharsetDetector;
-import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -183,7 +181,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
     private final DirectoryNode root;
     private final MAPIMessage msg;
     private final ParseContext parseContext;
-    HtmlEncodingDetector detector = new HtmlEncodingDetector();
+    private static final EncodingDetector DEFAULT_ENCODING_DETECTOR = new 
DefaultEncodingDetector();
 
 
     public OutlookExtractor(DirectoryNode root, Metadata metadata, 
ParseContext context) throws TikaException {
@@ -848,42 +846,83 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
             return;
         }
 
+        // A declared charset (message codepage, else a Content-Type header) 
is a hint,
+        // not a verdict -- the detector evaluates it against the raw body 
bytes below.
+        String declared = declaredCharset(msg, mainChunks);
+
+        // Detect on the raw body bytes -- the HTML binary chunk if present, 
else the
+        // text body. (msg.getHtmlBody() is an already-decoded String, so 
detecting its
+        // re-encoded bytes would just report the re-encoding charset.)
+        byte[] body = null;
+        ByteChunk htmlBinary = mainChunks.getHtmlBodyChunkBinary();
+        if (htmlBinary != null && htmlBinary.getValue() != null) {
+            body = htmlBinary.getValue();
+        } else if (mainChunks.getTextBodyChunk() != null) {
+            body = mainChunks.getTextBodyChunk().getRawValue();
+        }
+
+        EncodingDetector encodingDetector = 
context.get(EncodingDetector.class);
+        if (encodingDetector == null) {
+            encodingDetector = DEFAULT_ENCODING_DETECTOR;
+        }
+
+        if (body != null && body.length > 0) {
+            Metadata metadata = new Metadata();
+            if (declared != null) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
+                        "text/plain; charset=" + declared);
+            }
+            try (TikaInputStream tis = TikaInputStream.get(body)) {
+                List<EncodingResult> results = encodingDetector.detect(tis, 
metadata, context);
+                if (!results.isEmpty() && results.get(0).getConfidence() > 
0.35f
+                        && tryToSet7BitEncoding(msg, 
results.get(0).getDecodeAs().name())) {
+                    return;
+                }
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+
+        // No body to adjudicate against (or detection abstained): trust the 
declaration.
+        if (declared != null) {
+            tryToSet7BitEncoding(msg, declared);
+        }
+    }
+
+    /**
+     * The charset a 7-bit message declares for itself: its codepage property
+     * (MESSAGE_CODEPAGE / INTERNET_CPID), else a {@code charset} on a 
Content-Type
+     * header. A hint for the detector, not a verdict. {@code null} if none.
+     */
+    private static String declaredCharset(MAPIMessage msg, Chunks mainChunks) {
         Map<MAPIProperty, List<PropertyValue>> props = 
mainChunks.getProperties();
         if (props != null) {
-            // First choice is a codepage property
-            for (MAPIProperty prop : new 
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID}) {
+            for (MAPIProperty prop : new 
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE,
+                    MAPIProperty.INTERNET_CPID}) {
                 List<PropertyValue> val = props.get(prop);
                 if (val != null && val.size() > 0) {
                     int codepage = ((PropertyValue.LongPropertyValue) 
val.get(0)).getValue();
-                    String encoding = null;
                     try {
-                        encoding = CodePageUtil.codepageToEncoding(codepage, 
true);
-                    } catch (UnsupportedEncodingException e) {
-                        //swallow
-                    }
-                    if (tryToSet7BitEncoding(msg, encoding)) {
-                        return;
+                        String encoding = 
CodePageUtil.codepageToEncoding(codepage, true);
+                        if (encoding != null && Charset.isSupported(encoding)) 
{
+                            return encoding;
+                        }
+                    } catch (UnsupportedEncodingException | 
IllegalArgumentException e) {
+                        //swallow, try the next source
                     }
                 }
             }
         }
-
-        // Second choice is a charset on a content type header
         try {
             String[] headers = msg.getHeaders();
-            if (headers != null && headers.length > 0) {
-                // Look for a content type with a charset
-                Pattern p = 
Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", 
Pattern.CASE_INSENSITIVE);
-
+            if (headers != null) {
+                Pattern p = Pattern.compile(
+                        "Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", 
Pattern.CASE_INSENSITIVE);
                 for (String header : headers) {
                     if (header.startsWith("Content-Type")) {
                         Matcher m = p.matcher(header);
                         if (m.matches()) {
-                            // Found it! Tell all the string chunks
-                            String charset = m.group(1);
-                            if (tryToSet7BitEncoding(msg, charset)) {
-                                return;
-                            }
+                            return m.group(1);
                         }
                     }
                 }
@@ -891,49 +930,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         } catch (ChunkNotFoundException e) {
             //swallow
         }
-
-        // Nothing suitable in the headers, try HTML
-        // TODO: do we need to replicate this in Tika? If we wind up
-        // parsing the html version of the email, this is duplicative??
-        // Or do we need to reset the header strings based on the html
-        // meta header if there is no other information?
-        try {
-            String html = msg.getHtmlBody();
-            if (html != null && html.length() > 0) {
-                Charset charset = null;
-                try (TikaInputStream tis = 
TikaInputStream.get(html.getBytes(UTF_8))) {
-                    List<EncodingResult> encResults =
-                            detector.detect(tis, EMPTY_METADATA, context);
-                    charset = encResults.isEmpty() ? null : 
encResults.get(0).getDecodeAs();
-                } catch (IOException e) {
-                    //swallow
-                }
-                if (charset != null && tryToSet7BitEncoding(msg, 
charset.name())) {
-                    return;
-                }
-            }
-        } catch (ChunkNotFoundException e) {
-            //swallow
-        }
-
-        //absolute last resort, try charset detector
-        StringChunk text = mainChunks.getTextBodyChunk();
-        if (text != null) {
-            CharsetDetector detector = new CharsetDetector();
-            detector.setText(text.getRawValue());
-            CharsetMatch match = detector.detect();
-            if (match != null && match.getConfidence() > 35) {
-                String charsetName = match.getName();
-                try {
-                    charsetName = 
CharsetSupersets.decodeAs(Charset.forName(charsetName)).name();
-                } catch (IllegalArgumentException e) {
-                    //ICU name not a resolvable Java charset; use as-is
-                }
-                if (tryToSet7BitEncoding(msg, charsetName)) {
-                    return;
-                }
-            }
-        }
+        return null;
     }
 
     private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
index f894593be4..f434566f2b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
@@ -31,6 +31,7 @@ import org.xml.sax.SAXException;
 
 import org.apache.tika.config.TikaComponent;
 import org.apache.tika.detect.CharsetSupersets;
+import org.apache.tika.detect.DefaultEncodingDetector;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.detect.EncodingResult;
 import org.apache.tika.exception.TikaException;
@@ -40,7 +41,6 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.txt.Icu4jEncodingDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
@@ -57,6 +57,7 @@ public class DBFParser implements Parser {
     private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10;
     private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000;
     private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
+    private static final EncodingDetector DEFAULT_ENCODING_DETECTOR = new 
DefaultEncodingDetector();
 
     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.singleton(MediaType.application("x-dbf"));
@@ -140,7 +141,10 @@ public class DBFParser implements Parser {
         }
         byte[] bytes = bos.toByteArray();
         if (bytes.length > 20) {
-            EncodingDetector detector = new Icu4jEncodingDetector();
+            EncodingDetector detector = 
parseContext.get(EncodingDetector.class);
+            if (detector == null) {
+                detector = DEFAULT_ENCODING_DETECTOR;
+            }
             try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                 List<EncodingResult> results =
                         detector.detect(TikaInputStream.get(bytes), new 
Metadata(), parseContext);

(tika) 01/01: TIKA-4752 -- follow up

Reply via email to