[ https://issues.apache.org/jira/browse/TIKA-2208?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15761055#comment-15761055 ]
Tim Allison edited comment on TIKA-2208 at 12/19/16 12:54 PM: -------------------------------------------------------------- Three cheers for unit tests! It looks like we need to add vnd.ms-powerpoint.template.macroenabled.12 to OOXMLParser's handled media types. I'll make that change shortly. Meanwhile, you could try something like this, which runs against nearly all of our test documents: {noformat} private static final Set<MediaType> INCLUDES = new HashSet<>(); static { for (MediaType mediaType : OOXMLParser.SUPPORTED_TYPES) { if (mediaType.equals(MediaType.application("x-tika-ooxml"))) { continue; } INCLUDES.add(mediaType); } INCLUDES.add(MediaType.application("vnd.ms-powerpoint.template.macroenabled.12")); } private static final Set<MediaType> EXCLUDES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( MediaType.application("x-tika-ooxml") ))); private static final Parser DECORATED_PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), ParserDecorator.withTypes( ParserDecorator.withoutTypes( new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES ), INCLUDES), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), new org.apache.tika.parser.epub.EpubParser(), }; private static final Parser STANDARD_PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), new 
org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), new org.apache.tika.parser.epub.EpubParser(), }; private static final AutoDetectParser DECORATED_PARSER_INSTANCE = new AutoDetectParser(DECORATED_PARSERS); private static final AutoDetectParser STANDARD_PARSER_INSTANCE = new AutoDetectParser(STANDARD_PARSERS); private static final Tika DECORATED_TIKA = new Tika(DECORATED_PARSER_INSTANCE.getDetector(), DECORATED_PARSER_INSTANCE); private static final Tika STANDARD_TIKA = new Tika(STANDARD_PARSER_INSTANCE.getDetector(), STANDARD_PARSER_INSTANCE); @Test public void testSkipVisioOOXML() throws Exception { for (File f : getResourceAsFile("/test-documents").listFiles()) { if (f.isDirectory()) { continue; } if (f.getName().contains("VISIO") && (f.getName().endsWith("x") || f.getName().endsWith("m"))) { continue; } if (f.getName().contains("embeddedVsdx")) { continue; } boolean decoratedEx = false; boolean standardEx = false; String decoratedOutput = ""; String standardOutput = ""; try (InputStream is = TikaInputStream.get(f)) { decoratedOutput = DECORATED_TIKA.parseToString(is); } catch (Throwable e) { decoratedEx = true; } try (InputStream is = TikaInputStream.get(f)) { standardOutput = STANDARD_TIKA.parseToString(is); } catch (Throwable e) { standardEx = true; } assertEquals(f.getName(), standardEx, decoratedEx); if (standardEx == false) { assertEquals(f.getName(), standardOutput, decoratedOutput); } } } {noformat} was (Author: talli...@mitre.org): Three cheers for unit tests! It looks like we need to add vnd.ms-powerpoint.template.macroenabled.12 to OOXMLParser's handled media types. I'll make that change shortly. 
Meanwhile, you could try something like this, which runs against nearly all of our test documents: {noformat} private static final Set<MediaType> INCLUDES = new HashSet<>(); static { for (MediaType mediaType : OOXMLParser.SUPPORTED_TYPES) { if (mediaType.equals(MediaType.application("x-tika-ooxml"))) { continue; } INCLUDES.add(mediaType); } INCLUDES.add(MediaType.application("vnd.ms-powerpoint.template.macroenabled.12")); } private static final Set<MediaType> EXCLUDES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( MediaType.application("x-tika-ooxml") ))); private static final Parser DECORATED_PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), ParserDecorator.withTypes(ParserDecorator.withoutTypes( new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES ), INCLUDES), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), new org.apache.tika.parser.epub.EpubParser(), }; private static final Parser STANDARD_PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), new org.apache.tika.parser.epub.EpubParser(), }; private static final AutoDetectParser DECORATED_PARSER_INSTANCE = new 
AutoDetectParser(DECORATED_PARSERS); private static final AutoDetectParser STANDARD_PARSER_INSTANCE = new AutoDetectParser(STANDARD_PARSERS); private static final Tika DECORATED_TIKA = new Tika(DECORATED_PARSER_INSTANCE.getDetector(), DECORATED_PARSER_INSTANCE); private static final Tika STANDARD_TIKA = new Tika(STANDARD_PARSER_INSTANCE.getDetector(), STANDARD_PARSER_INSTANCE); @Test public void testSkipVisioOOXML() throws Exception { for (File f : getResourceAsFile("/test-documents").listFiles()) { if (f.isDirectory()) { continue; } if (f.getName().contains("VISIO") && (f.getName().endsWith("x") || f.getName().endsWith("m"))) { continue; } if (f.getName().contains("embeddedVsdx")) { continue; } boolean decoratedEx = false; boolean standardEx = false; String decoratedOutput = ""; String standardOutput = ""; try (InputStream is = TikaInputStream.get(f)) { decoratedOutput = DECORATED_TIKA.parseToString(is); } catch (Throwable e) { decoratedEx = true; } try (InputStream is = TikaInputStream.get(f)) { standardOutput = STANDARD_TIKA.parseToString(is); } catch (Throwable e) { standardEx = true; } assertEquals(f.getName(), standardEx, decoratedEx); if (standardEx == false) { assertEquals(f.getName(), standardOutput, decoratedOutput); } } } {noformat} > Catch missing libraries > ----------------------- > > Key: TIKA-2208 > URL: https://issues.apache.org/jira/browse/TIKA-2208 > Project: Tika > Issue Type: Improvement > Components: parser > Reporter: David Pilato > > Hi there > We have decided to remove support for some formats when using Tika to extract > text and metadata. 
> We defined our list of Parsers: > {code:java} > private static final Parser PARSERS[] = new Parser[] { > // documents > new org.apache.tika.parser.html.HtmlParser(), > new org.apache.tika.parser.rtf.RTFParser(), > new org.apache.tika.parser.pdf.PDFParser(), > new org.apache.tika.parser.txt.TXTParser(), > new org.apache.tika.parser.microsoft.OfficeParser(), > new org.apache.tika.parser.microsoft.OldExcelParser(), > new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), > new org.apache.tika.parser.odf.OpenDocumentParser(), > new org.apache.tika.parser.iwork.IWorkPackageParser(), > new org.apache.tika.parser.xml.DcXMLParser(), > new org.apache.tika.parser.epub.EpubParser(), > }; > private static final AutoDetectParser PARSER_INSTANCE = new > AutoDetectParser(PARSERS); > private static final Tika TIKA_INSTANCE = new > Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE); > {code} > But when an MS Office Word document embeds another unsupported document > (like a Visio schema), a {{NoClassDefFoundError}} is raised. > Would it be possible to catch such a case and throw a > {{TikaException}} instead, so that it behaves as an Exception and not as an Error? -- This message was sent by Atlassian JIRA (v6.3.4#6332)