This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d66da4f4e0 TIKA-4752-follow-up (#2871)
d66da4f4e0 is described below
commit d66da4f4e0c8ee23da92d8887f93a4802d6e8a96
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:09:11 2026 -0400
TIKA-4752-follow-up (#2871)
---
.../tika/detect/MetadataCharsetDetector.java | 4 +-
.../tika/parser/microsoft/OutlookExtractor.java | 133 ++++++++++-----------
.../java/org/apache/tika/parser/dbf/DBFParser.java | 14 ++-
3 files changed, 78 insertions(+), 73 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
index 1581bc9b74..4ea2a6a771 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
@@ -40,7 +40,9 @@ import org.apache.tika.parser.ParseContext;
* HTTP/MIME Content-Type header (e.g. {@code text/html; charset=UTF-8}).</li>
* <li>{@link TikaCoreProperties#CONTENT_TYPE_HINT} — the {@code charset} parameter
* of a content-type a source <em>claimed</em> for the bytes (e.g. an HTML
- * {@code <meta>} tag, or a zip entry's UTF-8 (EFS) flag). A hint, not a verdict.</li>
+ * {@code <meta>} tag, or a zip entry's UTF-8 (EFS) flag). A hint, not a verdict.
+ * This key is only consulted when {@link MetadataCharsetDetector} is included in
+ * the active {@link org.apache.tika.detect.EncodingDetector} chain.</li>
* <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by parsers
* such as {@code RFC822Parser}, which splits Content-Type into a bare
* media-type key and a separate charset key.</li>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index c6f4bc8db6..d17bd9edd4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -68,7 +68,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
-import org.apache.tika.detect.CharsetSupersets;
+import org.apache.tika.detect.DefaultEncodingDetector;
+import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -81,14 +82,11 @@ import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
import org.apache.tika.parser.microsoft.rtf.RTFParser;
import org.apache.tika.parser.microsoft.rtf.jflex.RTFHtmlDecapsulator;
-import org.apache.tika.parser.txt.CharsetDetector;
-import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -183,7 +181,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
private final DirectoryNode root;
private final MAPIMessage msg;
private final ParseContext parseContext;
- HtmlEncodingDetector detector = new HtmlEncodingDetector();
+ private static final EncodingDetector DEFAULT_ENCODING_DETECTOR = new
DefaultEncodingDetector();
public OutlookExtractor(DirectoryNode root, Metadata metadata,
ParseContext context) throws TikaException {
@@ -848,42 +846,83 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
return;
}
+ // A declared charset (message codepage, else a Content-Type header) is a hint,
+ // not a verdict -- the detector evaluates it against the raw body bytes below.
+ String declared = declaredCharset(msg, mainChunks);
+
+ // Detect on the raw body bytes -- the HTML binary chunk if present, else the
+ // text body. (msg.getHtmlBody() is an already-decoded String, so detecting its
+ // re-encoded bytes would just report the re-encoding charset.)
+ byte[] body = null;
+ ByteChunk htmlBinary = mainChunks.getHtmlBodyChunkBinary();
+ if (htmlBinary != null && htmlBinary.getValue() != null) {
+ body = htmlBinary.getValue();
+ } else if (mainChunks.getTextBodyChunk() != null) {
+ body = mainChunks.getTextBodyChunk().getRawValue();
+ }
+
+ EncodingDetector encodingDetector =
context.get(EncodingDetector.class);
+ if (encodingDetector == null) {
+ encodingDetector = DEFAULT_ENCODING_DETECTOR;
+ }
+
+ if (body != null && body.length > 0) {
+ Metadata metadata = new Metadata();
+ if (declared != null) {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT,
+ "text/plain; charset=" + declared);
+ }
+ try (TikaInputStream tis = TikaInputStream.get(body)) {
+ List<EncodingResult> results = encodingDetector.detect(tis,
metadata, context);
+ if (!results.isEmpty() && results.get(0).getConfidence() >
0.35f
+ && tryToSet7BitEncoding(msg,
results.get(0).getDecodeAs().name())) {
+ return;
+ }
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+
+ // No body to adjudicate against (or detection abstained): trust the declaration.
+ if (declared != null) {
+ tryToSet7BitEncoding(msg, declared);
+ }
+ }
+
+ /**
+ * The charset a 7-bit message declares for itself: its codepage property
+ * (MESSAGE_CODEPAGE / INTERNET_CPID), else a {@code charset} on a Content-Type
+ * header. A hint for the detector, not a verdict. {@code null} if none.
+ */
+ private static String declaredCharset(MAPIMessage msg, Chunks mainChunks) {
Map<MAPIProperty, List<PropertyValue>> props =
mainChunks.getProperties();
if (props != null) {
- // First choice is a codepage property
- for (MAPIProperty prop : new
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID}) {
+ for (MAPIProperty prop : new
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE,
+ MAPIProperty.INTERNET_CPID}) {
List<PropertyValue> val = props.get(prop);
if (val != null && val.size() > 0) {
int codepage = ((PropertyValue.LongPropertyValue)
val.get(0)).getValue();
- String encoding = null;
try {
- encoding = CodePageUtil.codepageToEncoding(codepage,
true);
- } catch (UnsupportedEncodingException e) {
- //swallow
- }
- if (tryToSet7BitEncoding(msg, encoding)) {
- return;
+ String encoding =
CodePageUtil.codepageToEncoding(codepage, true);
+ if (encoding != null && Charset.isSupported(encoding))
{
+ return encoding;
+ }
+ } catch (UnsupportedEncodingException |
IllegalArgumentException e) {
+ //swallow, try the next source
}
}
}
}
-
- // Second choice is a charset on a content type header
try {
String[] headers = msg.getHeaders();
- if (headers != null && headers.length > 0) {
- // Look for a content type with a charset
- Pattern p =
Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
Pattern.CASE_INSENSITIVE);
-
+ if (headers != null) {
+ Pattern p = Pattern.compile(
+ "Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
Pattern.CASE_INSENSITIVE);
for (String header : headers) {
if (header.startsWith("Content-Type")) {
Matcher m = p.matcher(header);
if (m.matches()) {
- // Found it! Tell all the string chunks
- String charset = m.group(1);
- if (tryToSet7BitEncoding(msg, charset)) {
- return;
- }
+ return m.group(1);
}
}
}
@@ -891,49 +930,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
} catch (ChunkNotFoundException e) {
//swallow
}
-
- // Nothing suitable in the headers, try HTML
- // TODO: do we need to replicate this in Tika? If we wind up
- // parsing the html version of the email, this is duplicative??
- // Or do we need to reset the header strings based on the html
- // meta header if there is no other information?
- try {
- String html = msg.getHtmlBody();
- if (html != null && html.length() > 0) {
- Charset charset = null;
- try (TikaInputStream tis =
TikaInputStream.get(html.getBytes(UTF_8))) {
- List<EncodingResult> encResults =
- detector.detect(tis, EMPTY_METADATA, context);
- charset = encResults.isEmpty() ? null :
encResults.get(0).getDecodeAs();
- } catch (IOException e) {
- //swallow
- }
- if (charset != null && tryToSet7BitEncoding(msg,
charset.name())) {
- return;
- }
- }
- } catch (ChunkNotFoundException e) {
- //swallow
- }
-
- //absolute last resort, try charset detector
- StringChunk text = mainChunks.getTextBodyChunk();
- if (text != null) {
- CharsetDetector detector = new CharsetDetector();
- detector.setText(text.getRawValue());
- CharsetMatch match = detector.detect();
- if (match != null && match.getConfidence() > 35) {
- String charsetName = match.getName();
- try {
- charsetName =
CharsetSupersets.decodeAs(Charset.forName(charsetName)).name();
- } catch (IllegalArgumentException e) {
- //ICU name not a resolvable Java charset; use as-is
- }
- if (tryToSet7BitEncoding(msg, charsetName)) {
- return;
- }
- }
- }
+ return null;
}
private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
index f894593be4..c44b0b7be0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFParser.java
@@ -31,6 +31,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.CharsetSupersets;
+import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
@@ -40,7 +41,6 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -57,6 +57,7 @@ public class DBFParser implements Parser {
private static final int ROWS_TO_BUFFER_FOR_CHARSET_DETECTION = 10;
private static final int MAX_CHARS_FOR_CHARSET_DETECTION = 20000;
private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
+ private static final EncodingDetector DEFAULT_ENCODING_DETECTOR = new
DefaultEncodingDetector();
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-dbf"));
@@ -140,11 +141,16 @@ public class DBFParser implements Parser {
}
byte[] bytes = bos.toByteArray();
if (bytes.length > 20) {
- EncodingDetector detector = new Icu4jEncodingDetector();
+ EncodingDetector detector =
parseContext.get(EncodingDetector.class);
+ if (detector == null) {
+ detector = DEFAULT_ENCODING_DETECTOR;
+ }
try (TikaInputStream tis = TikaInputStream.get(bytes)) {
List<EncodingResult> results =
- detector.detect(TikaInputStream.get(bytes), new
Metadata(), parseContext);
- charset = results.isEmpty() ? null :
results.get(0).getCharset();
+ detector.detect(tis, new Metadata(), parseContext);
+ if (!results.isEmpty() && results.get(0).getCharset() != null)
{
+ charset = results.get(0).getCharset();
+ }
}
}
return charset;