Hi, I am trying to create a very simple Boilerpipe parser for Tika.
The Scala class is: class BoilerPipeParser(val extractor: BoilerpipeExtractor) extends HtmlParser { def this() = this(DefaultExtractor.INSTANCE) override def parse(stream: InputStream, handler: ContentHandler, metadata: Metadata, context: ParseContext) { super.parse(stream, new BoilerpipeContentHandler(handler, extractor), metadata, context) } } I then call parse on a Tika instance that was configured with the following AutoDetectParser: val generic = new AutoDetectParser new AutoDetectParser( generic.getDetector, generic, ParserDecorator.withTypes(BoilerPipeParser.defaultExtractor, htmlMediaTypes) ) Invoking tika.parse(stream, meta) generates the following exception: [ERROR] [07/27/2012 14:49:04.189] [pipeline-akka.actor.default-dispatcher-6] [akka://pipeline/user/$c] java.io.IOException: at org.apache.tika.parser.ParsingReader.read(ParsingReader.java:260) at java.io.BufferedReader.fill(BufferedReader.java:154) at java.io.BufferedReader.readLine(BufferedReader.java:317) at java.io.BufferedReader.readLine(BufferedReader.java:382) at com.eligotech.common.io.package$.readLine(package.scala:74) at com.eligotech.common.io.package$.readLines(package.scala:79) at com.eligotech.common.io.package$.readAllLines(package.scala:84) at com.eligotech.samiksa.enhancers.tika.TextExtractor.com$eligotech$samiksa$enhancers$tika$TextExtractor$$extractText(TextExtractor.scala:28) at com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1$$anonfun$apply$2.apply(TextExtractor.scala:23) at com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1$$anonfun$apply$2.apply(TextExtractor.scala:23) at scala.util.control.Exception$Catch$$anonfun$either$1.apply(Exception.scala:110) at scala.util.control.Exception$Catch$$anonfun$either$1.apply(Exception.scala:110) at scala.util.control.Exception$Catch.apply(Exception.scala:88) at 
scala.util.control.Exception$Catch.either(Exception.scala:110) at com.eligotech.samiksa.package$.expect(package.scala:17) at com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1.apply(TextExtractor.scala:23) at com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1.apply(TextExtractor.scala:22) at com.eligotech.common.package$.closing(package.scala:7) at com.eligotech.samiksa.enhancers.tika.TextExtractor.extractText(TextExtractor.scala:22) at com.eligotech.samiksa.enhancers.tika.TextExtractor.apply(TextExtractor.scala:16) at com.eligotech.samiksa.enhancers.tika.TextExtractor.apply(TextExtractor.scala:15) at com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$$anonfun$process$1.apply(AkkaPipelinesBuilder.scala:98) at com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$$anonfun$process$1.apply(AkkaPipelinesBuilder.scala:98) at com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor$$anonfun$receive$1.apply(AkkaPipelinesBuilder.scala:40) at com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor$$anonfun$receive$1.apply(AkkaPipelinesBuilder.scala:39) at akka.actor.Actor$class.apply(Actor.scala:318) at com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor.apply(AkkaPipelinesBuilder.scala:25) at akka.actor.ActorCell.invoke(ActorCell.scala:626) at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:197) at akka.dispatch.Mailbox.run(Mailbox.scala:179) at akka.dispatch.ForkJoinExecutorConfigurator$MailboxExecutionTask.exec(AbstractDispatcher.scala:516) at akka.jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:259) at akka.jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:975) at akka.jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479) at akka.jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104) Caused by: org.apache.tika.exception.TikaException: Zip bomb detected! 
at org.apache.tika.sax.SecureContentHandler.throwIfCauseOf(SecureContentHandler.java:192) at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:123) at org.apache.tika.parser.ParsingReader$ParsingTask.run(ParsingReader.java:221) at java.lang.Thread.run(Thread.java:722) Caused by: org.apache.tika.sax.SecureContentHandler$SecureSAXException: Suspected zip bomb: 100 levels of XML element nesting at org.apache.tika.sax.SecureContentHandler.startElement(SecureContentHandler.java:234) at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126) at org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:205) at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126) at org.apache.tika.sax.SafeContentHandler.startElement(SafeContentHandler.java:264) at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:245) at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:275) at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:169) at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:129) at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126) at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61) at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794) at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061) at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016) at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:567) at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449) at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:104) at com.eligotech.samiksa.enhancers.tika.BoilerPipeParser.parse(BoilerPipeParser.scala:18) at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) ... 2 more Did I miss something? Kind regards, Mark