[ https://issues.apache.org/jira/browse/TIKA-4398?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17946339#comment-17946339 ]
Tilman Hausherr edited comment on TIKA-4398 at 4/22/25 8:26 AM: ---------------------------------------------------------------- It worked for me... I didn't use the additional dependencies; maybe retry with a separate project. Here's the code I used, which is slightly different in loading and in the output: {code:java} import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.util.Arrays; import java.util.List; import org.apache.tika.config.ServiceLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.audio.AudioParser; import org.apache.tika.parser.audio.MidiParser; import org.apache.tika.parser.mp3.Mp3Parser; import org.apache.tika.parser.mp4.MP4Parser; import org.apache.tika.parser.pkg.CompressorParser; import org.apache.tika.parser.pkg.RarParser; import org.apache.tika.parser.video.FLVParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class TIKA4398docx_2 extends AutoDetectParser { public static void main(String[] args) throws IOException, URISyntaxException, SAXException, TikaException { List<Class<? 
extends Parser>> excludeParsers = Arrays.asList( MP4Parser.class, AudioParser.class, Mp3Parser.class, MidiParser.class, FLVParser.class, CompressorParser.class, RarParser.class ); TikaConfig config = TikaConfig.getDefaultConfig(); Parser myParser = new DefaultParser(config.getMediaTypeRegistry(), new ServiceLoader(), excludeParsers); Parser parser = new AutoDetectParser(config.getDetector(), myParser); ContentHandler contentHandler = new BodyContentHandler(); Metadata meta = new Metadata(); ParseContext context = new ParseContext(); context.set(Parser.class, parser); InputStream is = new URI("https://issues.apache.org/jira/secure/attachment/13076074/01.docx").toURL().openStream(); byte[] ba = is.readAllBytes(); InputStream stream = new ByteArrayInputStream(ba); parser.parse(stream, contentHandler, meta, context); System.out.println("Extracted? " + (contentHandler.toString().contains("RESUME") ? "yes" : "no")); for (String name : meta.names()) { if (meta.getValues(name).length > 1) { System.out.println(name + ": " + Arrays.toString(meta.getValues(name))); // get() brings only the first one! } else { System.out.println(name + ": " + meta.get(name)); } } } } {code} The output with 3.1.0: {noformat} Extracted? 
yes cp:revision: 31 meta:paragraph-count: 1 meta:word-count: 11 extended-properties:Application: WPS Office_10.1.0.7400_F1E327BC-269C-435d-A152-05C5408002CA meta:last-author: WPS_1528193819 X-TIKA:Parsed-By-Full-Set: [org.apache.tika.parser.DefaultParser, org.apache.tika.parser.microsoft.ooxml.OOXMLParser, org.apache.tika.parser.image.ImageParser] dc:creator: Administrator xmpTPg:NPages: 4 dcterms:created: 2017-09-28T08:20:00Z meta:line-count: 1 dcterms:modified: 2018-07-24T04:40:31Z meta:character-count: 68 extended-properties:Template: ?2017???????????????.docx meta:character-count-with-spaces: 78 X-TIKA:Parsed-By: [org.apache.tika.parser.DefaultParser, org.apache.tika.parser.microsoft.ooxml.OOXMLParser] extended-properties:DocSecurityString: None extended-properties:TotalTime: 4 meta:page-count: 4 Content-Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document custom:KSOProductBuildVer: 2052-10.1.0.7400 {noformat} was (Author: tilman): It worked for me... I didn't use the additional dependencies; maybe retry with a separate project. 
Here's the code I used, which is slightly different in loading and in the output: {code:java} import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.util.Arrays; import java.util.List; import org.apache.tika.config.ServiceLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.audio.AudioParser; import org.apache.tika.parser.audio.MidiParser; import org.apache.tika.parser.mp3.Mp3Parser; import org.apache.tika.parser.mp4.MP4Parser; import org.apache.tika.parser.pkg.CompressorParser; import org.apache.tika.parser.pkg.RarParser; import org.apache.tika.parser.video.FLVParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class TIKA4398docx_2 extends AutoDetectParser { public static void main(String[] args) throws IOException, URISyntaxException, SAXException, TikaException { List<Class<? 
extends Parser>> excludeParsers = Arrays.asList( MP4Parser.class, AudioParser.class, Mp3Parser.class, MidiParser.class, FLVParser.class, CompressorParser.class, RarParser.class ); TikaConfig config = TikaConfig.getDefaultConfig(); Parser myParser = new DefaultParser(config.getMediaTypeRegistry(), new ServiceLoader(), excludeParsers); Parser parser = new AutoDetectParser(config.getDetector(), myParser); ContentHandler contentHandler = new BodyContentHandler(); Metadata meta = new Metadata(); ParseContext context = new ParseContext(); context.set(Parser.class, parser); InputStream is = new URI("https://issues.apache.org/jira/secure/attachment/13076074/01.docx").toURL().openStream(); byte[] ba = is.readAllBytes(); InputStream stream = new ByteArrayInputStream(ba); parser.parse(stream, contentHandler, meta, context); System.out.println("Extracted? " + (contentHandler.toString().contains("RESUME") ? "yes" : "no")); for (String name : meta.names()) { if (meta.getValues(name).length > 1) { System.out.println(name + ": " + Arrays.toString(meta.getValues(name))); // get() brings only the first one! } else { System.out.println(name + ": " + meta.get(name)); } } } } {code} > When extracting a docx file with Tika 3.1.0, the package parser was detected > instead of the OOXML parser > -------------------------------------------------------------------------------------------------------- > > Key: TIKA-4398 > URL: https://issues.apache.org/jira/browse/TIKA-4398 > Project: Tika > Issue Type: Bug > Components: tika-core > Affects Versions: 3.1.0 > Environment: java17 > Reporter: mannixli > Priority: Major > Attachments: 01.docx, image-2025-04-16-20-46-07-228.png, > image-2025-04-22-11-26-09-936.png, image-2025-04-22-11-27-33-655.png, > image-2025-04-22-11-37-15-401.png > > > 3.0.0 detected ooxml parser -- This message was sent by Atlassian Jira (v8.20.10#820010)