[ https://issues.apache.org/jira/browse/TIKA-4459?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18010647#comment-18010647 ]
Manish S N edited comment on TIKA-4459 at 7/29/25 12:30 PM:
------------------------------------------------------------
I did a performance test comparing spooling for every file against the normal (non-spooling) stream path. The results (times in milliseconds):
{code:java}
17:05:39.547 [main] INFO org.manish.PerfTest -- #$# spooling: true, fileCount: 23, meanTime: 5.96, stdDeviation: 3.78, minTime: 2.0, maxTime: 13.0
17:05:39.547 [main] INFO org.manish.PerfTest -- #$# spooling: false, fileCount: 23, meanTime: 5.13, stdDeviation: 4.81, minTime: 1.0, maxTime: 17.0
{code}
I only walked the files in my document folder, and my machine has an SSD, so the results might be influenced by that. The spooling variant had a slightly higher mean time but a lower standard deviation and maximum, so its timings were more consistent. I attach the performance test code below so you can try it against your own folder of test files:
{code:java}
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

import org.apache.commons.compress.PasswordRequiredException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.tika.Tika;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PerfTest {
    // Measures the performance difference between spooling and not spooling the input stream.
    public static final Path TEST_FOLDER = Path.of("/path/redacted/for/privacy");

    static Logger log = LoggerFactory.getLogger(PerfTest.class);
    static DescriptiveStatistics spoolingStats = new DescriptiveStatistics();
    static DescriptiveStatistics nonSpoolingStats = new DescriptiveStatistics();
    static Tika parser = new Tika();

    public static void main(String[] args) {
        testSpoolingPerf();
    }

    private static String parse(InputStream fileStream, Metadata meta) throws TikaException, IOException {
        return parser.parseToString(fileStream, meta);
    }

    private static void testSpoolingPerf() {
        // Walks all regular files under TEST_FOLDER and parses each one with and without spooling.
        try (Stream<Path> paths = Files.walk(TEST_FOLDER)) {
            paths.filter(Files::isRegularFile).forEach(PerfTest::testPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
        logStats(spoolingStats, true);
        logStats(nonSpoolingStats, false);
    }

    private static void logStats(DescriptiveStatistics stats, boolean spooling) {
        log.info("#$# spooling: {}, fileCount: {}, meanTime: {}, stdDeviation: {}, minTime: {}, maxTime: {}",
                spooling, stats.getN(),
                String.format("%.2f", stats.getMean()),
                String.format("%.2f", stats.getStandardDeviation()),
                stats.getMin(), stats.getMax());
    }

    private static void testPath(Path file) {
        parsePath(file, true, null);  // warmup, not recorded
        parsePath(file, false, null); // warmup, not recorded
        log.info("Warmup completed");
        parsePath(file, true, spoolingStats);
        parsePath(file, false, nonSpoolingStats);
    }

    private static void parsePath(Path filePath, boolean spooling, DescriptiveStatistics stats) {
        long startTime = System.currentTimeMillis();
        long size = -1;
        long parsedSize = -1;
        try (TikaInputStream tis = TikaInputStream.get(Files.newInputStream(filePath))) {
            log.debug("\n parsing file: {}", filePath.getFileName());
            Metadata meta = new Metadata();
            meta.set("resourceName", filePath.getFileName().toString());
            meta.set("size", String.valueOf(Files.size(filePath)));
            if (spooling) {
                // getPath() forces the stream to be spooled to a temporary file.
                tis.getPath();
            }
            String content = parse(tis, meta);
            size = Files.size(filePath);
            parsedSize = content.length();
            log.debug("File: {} , Type: {} , Size: {} , ParsedSize: {}",
                    filePath.getFileName(), meta.get("Content-Type"), size, parsedSize);
            log.debug("Content:\n{}", content);
        } catch (EncryptedDocumentException e) {
            log.info("File is encrypted: {}", filePath.getFileName());
        } catch (UnsupportedFormatException e) {
            log.info("Unsupported format for file: {}", filePath.getFileName());
        } catch (TikaException e) {
            // Some parsers wrap the password failure, so walk the cause chain.
            Throwable cause = e.getCause();
            while (cause != null) {
                if (cause instanceof PasswordRequiredException) {
                    log.info("File is encrypted: {}", filePath.getFileName());
                    return;
                }
                cause = cause.getCause();
            }
            log.error("Error processing file: {}", filePath.getFileName(), e);
        } catch (IOException e) {
            log.error("Error processing file: {}", filePath.getFileName(), e);
        }
        long timeTaken = System.currentTimeMillis() - startTime;
        if (stats != null) {
            stats.addValue(timeTaken);
        }
        log.info("#@# spool: {} , file: {} , size: {} , parsedSize: {} , time taken: {}",
                spooling, filePath.getFileName(), size, parsedSize, timeTaken);
    }
}
{code}
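The spooling toggle in isolation is just the {{getPath()}} call before parsing. Here is a minimal sketch (the file name is a placeholder for one of the attached test files):
{code:java}
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;

public class SpoolSketch {
    public static void main(String[] args) throws Exception {
        Path file = Path.of("testProtected.odp"); // placeholder: attachment from this issue
        try (InputStream raw = Files.newInputStream(file);
             TikaInputStream tis = TikaInputStream.get(raw)) {
            // Spools the stream to a temporary file; without this call the ODF
            // parser reads the raw stream and fails with the ZipException shown
            // in the stack trace below instead of signalling encryption.
            tis.getPath();
            System.out.println(new Tika().parseToString(tis, new Metadata()));
        }
    }
}
{code}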
> protected ODF encryption detection fail
> ---------------------------------------
>
>                 Key: TIKA-4459
>                 URL: https://issues.apache.org/jira/browse/TIKA-4459
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 3.2.1
>        Environment: Ubuntu 24.04.2 LTS x86_64
>            Reporter: Manish S N
>            Priority: Minor
>              Labels: encryption, odf, open-document-format, protected, regression, zip
>             Fix For: 4.0.0, 3.2.2
>         Attachments: protected.odt, testProtected.odp
>
> When passing an InputStream of a protected ODF file to Tika, we get a ZipException instead of an EncryptedDocumentException.
> This works correctly and throws EncryptedDocumentException if you create the TikaInputStream from a Path or call TikaInputStream.getPath(), since that spools the stream to a temporary file.
> But when working with InputStreams we get the following zip exception:
>
> org.apache.tika.exception.TikaException: TIKA-198: Illegal IOException from org.apache.tika.parser.odf.OpenDocumentParser@bae47a0
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:304)
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298)
> 	at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:204)
> 	at org.apache.tika.Tika.parseToString(Tika.java:525)
> 	at org.apache.tika.Tika.parseToString(Tika.java:495)
> 	at org.manish.AttachmentParser.parse(AttachmentParser.java:21)
> 	at org.manish.AttachmentParser.lambda$testParse$1(AttachmentParser.java:72)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
> 	at java.base/java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:177)
> 	at java.base/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:195)
> 	at java.base/java.util.Iterator.forEachRemaining(Iterator.java:133)
> 	at java.base/java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
> 	at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484)
> 	at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
> 	at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
> 	at java.base/java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:497)
> 	at org.manish.AttachmentParser.testParse(AttachmentParser.java:64)
> 	at org.manish.AttachmentParser.main(AttachmentParser.java:57)
> Caused by: java.util.zip.ZipException: only DEFLATED entries can have EXT descriptor
> 	at java.base/java.util.zip.ZipInputStream.readLOC(ZipInputStream.java:313)
> 	at java.base/java.util.zip.ZipInputStream.getNextEntry(ZipInputStream.java:125)
> 	at org.apache.tika.parser.odf.OpenDocumentParser.handleZipStream(OpenDocumentParser.java:218)
> 	at org.apache.tika.parser.odf.OpenDocumentParser.parse(OpenDocumentParser.java:169)
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298)
> 	... 19 more
>
> (We use Tika to detect encrypted docs.)
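A minimal sketch of the Path-based detection described above (the file name is a placeholder for the attached sample); creating the {{TikaInputStream}} from a {{Path}} gives the parser file-backed access, so the protected ODF surfaces as {{EncryptedDocumentException}} as expected:
{code:java}
import java.nio.file.Path;

import org.apache.tika.Tika;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;

public class EncryptionCheckSketch {
    public static void main(String[] args) {
        Path file = Path.of("protected.odt"); // placeholder: attachment from this issue
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            new Tika().parseToString(tis, new Metadata());
            System.out.println("not encrypted: " + file.getFileName());
        } catch (EncryptedDocumentException e) {
            // With a Path-backed stream the ODF parser can read the zip from a
            // file, so protected documents are reported here instead of failing
            // with the ZipException above.
            System.out.println("encrypted: " + file.getFileName());
        } catch (Exception e) {
            System.out.println("error processing " + file.getFileName() + ": " + e);
        }
    }
}
{code}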