[ https://issues.apache.org/jira/browse/TIKA-4398?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17946339#comment-17946339 ]
Tilman Hausherr commented on TIKA-4398: --------------------------------------- It worked for me... I didn't use the additional dependencies, maybe retry with a separate project. Here's the code I used, which is slightly different in loading and in the output: {code:java} import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.util.Arrays; import java.util.List; import org.apache.tika.config.ServiceLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.audio.AudioParser; import org.apache.tika.parser.audio.MidiParser; import org.apache.tika.parser.mp3.Mp3Parser; import org.apache.tika.parser.mp4.MP4Parser; import org.apache.tika.parser.pkg.CompressorParser; import org.apache.tika.parser.pkg.RarParser; import org.apache.tika.parser.video.FLVParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class TIKA4398docx_2 extends AutoDetectParser { public static void main(String[] args) throws IOException, URISyntaxException, SAXException, TikaException { List<Class<? extends Parser>> excludeParsers = Arrays.asList( MP4Parser.class, AudioParser.class, Mp3Parser.class, MidiParser.class, FLVParser.class, CompressorParser.class, RarParser.class ); TikaConfig config = TikaConfig.getDefaultConfig(); Parser myParser = new DefaultParser(config.getMediaTypeRegistry(), new ServiceLoader(), excludeParsers); Parser parser = new AutoDetectParser(config.getDetector(), myParser); ContentHandler contentHandler = new BodyContentHandler(); Metadata meta = new Metadata(); ParseContext context = new ParseContext(); context.set(Parser.class, parser); InputStream is = new URI("https://issues.apache.org/jira/secure/attachment/13076074/01.docx").toURL().openStream(); byte[] ba = is.readAllBytes(); InputStream stream = new ByteArrayInputStream(ba); parser.parse(stream, contentHandler, meta, context); System.out.println("Extracted? " + (contentHandler.toString().contains("RESUME") ? "yes" : "no")); for (String name : meta.names()) { if (meta.getValues(name).length > 1) { System.out.println(name + ": " + Arrays.toString(meta.getValues(name))); // get() brings only the first one! } else { System.out.println(name + ": " + meta.get(name)); } } } } {code} > When extracting a docx file with Tika 3.1.0, the package parser was detected > instead of the OOXML parser > -------------------------------------------------------------------------------------------------------- > > Key: TIKA-4398 > URL: https://issues.apache.org/jira/browse/TIKA-4398 > Project: Tika > Issue Type: Bug > Components: tika-core > Affects Versions: 3.1.0 > Environment: java17 > Reporter: mannixli > Priority: Major > Attachments: 01.docx, image-2025-04-16-20-46-07-228.png, > image-2025-04-22-11-26-09-936.png, image-2025-04-22-11-27-33-655.png, > image-2025-04-22-11-37-15-401.png > > > 3.0.0 detected ooxml parser -- This message was sent by Atlassian Jira (v8.20.10#820010)