This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch 4x-reg-epub-fixes in repository https://gitbox.apache.org/repos/asf/tika.git
commit e7f2184a34194d66f0e00697bdab733a58b92323 Author: tallison <[email protected]> AuthorDate: Mon Apr 13 11:49:46 2026 -0400 update epub along the lines of oodt --- .../org/apache/tika/parser/epub/EpubParser.java | 133 ++------------------- .../apache/tika/parser/epub/EpubParserTest.java | 7 +- 2 files changed, 14 insertions(+), 126 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index 9779008017..ae131407d0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -22,7 +22,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; -import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -35,10 +34,8 @@ import java.util.Map; import java.util.Set; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.lang3.StringUtils; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -52,8 +49,6 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.io.FilenameUtils; -import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -65,9 +60,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.ParserUtils; import org.apache.tika.utils.XMLReaderUtils; -import org.apache.tika.zip.utils.ZipSalvager; /** * Epub parser @@ -146,54 +139,28 @@ public class EpubParser implements Parser { private Set<String> bufferedParse(TikaInputStream tis, ContentHandler bodyHandler, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { + // DefaultZipContainerDetector opens (and salvages, if needed) the ZipFile and + // stashes it on the TikaInputStream. Reuse it when present; otherwise open ourselves. if (tis.getOpenContainer() instanceof ZipFile) { - return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, context, true); + return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, context); } - ZipFile zipFile = null; - try { - zipFile = ZipFile.builder().setFile(tis.getPath().toFile()).get(); - } catch (IOException e) { - ParserUtils.recordParserFailure(this, e, metadata); - return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context); - } - - try { - return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true); - } finally { - zipFile.close(); - } - } - - private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler, - XHTMLContentHandler xhtml, - Metadata metadata, ParseContext context) - throws IOException, TikaException, SAXException { - try (TemporaryResources resources = new TemporaryResources()) { - Path salvaged = - resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString())); - ZipSalvager.salvageCopy(brokenZip, salvaged); - try (ZipFile zipFile = ZipFile.builder().setFile(salvaged.toFile()).get()) { - return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false); - } catch (EpubZipException e) { - try (TikaInputStream tis = TikaInputStream.get(salvaged)) { - return streamingParse(tis, xhtml, metadata, context); - } - } + try (ZipFile zipFile = ZipFile.builder().setFile(tis.getPath().toFile()).get()) { + return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context); } } private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext context, boolean isStrict) - throws IOException, TikaException, SAXException, EpubZipException { + ParseContext context) + throws IOException, TikaException, SAXException { String rootOPF = getRoot(zipFile, context); if (rootOPF == null) { - throw new EpubZipException(); + return Collections.EMPTY_SET; } ZipArchiveEntry zae = zipFile.getEntry(rootOPF); if (zae == null || !zipFile.canReadEntryData(zae)) { - throw new EpubZipException(); + return Collections.EMPTY_SET; } try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) { opf.parse(tis, new DefaultHandler(), metadata, context); @@ -203,33 +170,14 @@ public class EpubParser implements Parser { try (InputStream is = zipFile.getInputStream(zae)) { XMLReaderUtils.parseSAX(is, contentOrderScraper, context); } - //if no content items, false if (contentOrderScraper.contentItems.isEmpty()) { - throw new EpubZipException(); + return Collections.EMPTY_SET; } String relativePath = ""; if (rootOPF.lastIndexOf("/") > -1) { relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1); } - if (isStrict) { - int found = 0; - for (String id : contentOrderScraper.contentItems) { - HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id); - if (hRefMediaPair != null && hRefMediaPair.href != null) { - zae = zipFile.getEntry(relativePath + hRefMediaPair.href); - if (zae != null && zipFile.canReadEntryData(zae)) { - found++; - } - } - } - //if not perfect match btwn items and readable items - //return false - if (found != contentOrderScraper.contentItems.size()) { - throw new EpubZipException(); - } - } - extractMetadata(zipFile, metadata, context); Set<String> encryptedItems = checkForDRM(zipFile); Set<String> processed = new HashSet<>(); @@ -306,12 +254,6 @@ public class EpubParser implements Parser { } } - private void checkForDRM(InputStream is, ParseContext parseContext) - throws IOException, TikaException, SAXException { - Set<String> encryptedItems = EncryptionHandler.parse(is, parseContext); - maybeThrowEncryptedException(encryptedItems); - } - private void maybeThrowEncryptedException(Set<String> encryptedItems) throws EncryptedDocumentException { if (encryptedItems.size() == 0) { @@ -437,55 +379,6 @@ public class EpubParser implements Parser { } } - //should only be used as a last resort on a truncated zip - private Set<String> streamingParse(InputStream stream, ContentHandler bodyHandler, - Metadata metadata, - ParseContext context) - throws IOException, TikaException, SAXException { - ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false); - - ZipArchiveEntry entry = zip.getNextEntry(); - SAXException sax = null; - while (entry != null) { - if (entry.getName().equals("mimetype")) { - updateMimeType(zip, metadata); - } else if (entry.getName().equals(META_INF_ENCRYPTION)) { - //when streaming, throw an encryption exception if anything is encrypted - checkForDRM(zip, context); - } else if (entry.getName().equals("metadata.xml")) { - try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) { - meta.parse(tisZip, new DefaultHandler(), metadata, context); - } - } else if (entry.getName().endsWith(".opf")) { - try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) { - opf.parse(tisZip, new DefaultHandler(), metadata, context); - } - } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") || - entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) { - try { - try (TikaInputStream tisZip = TikaInputStream.get(CloseShieldInputStream.wrap(zip))) { - content.parse(tisZip, bodyHandler, metadata, context); - } - } catch (SAXException e) { - if (WriteLimitReachedException.isWriteLimitReached(e)) { - throw e; - } - if (sax == null) { - sax = e; - } - } - } - entry = zip.getNextEntry(); - } - if (sax != null) { - throw sax; - } - //always empty -- we throw an encryption exception - //as soon as checkForDRM hits an encrypted item - return Collections.EMPTY_SET; - } - - private static class RootFinder extends DefaultHandler { String root = null; @@ -586,12 +479,6 @@ public class EpubParser implements Parser { } } - //any problem with parsing an epub file when it is - //a zip file - private static class EpubZipException extends IOException { - - } - //for now, this simply converts all names to local names to avoid //namespace conflicts in the content handler. This also removes namespaces //from attributes diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java index a9f7ea00a7..e8e1ffa50d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java @@ -33,7 +33,6 @@ import org.apache.tika.metadata.Epub; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.Parser; public class EpubParserTest extends TikaTest { @@ -88,10 +87,12 @@ public class EpubParserTest extends TikaTest { @Test public void testTruncated() throws Exception { - Parser p = new EpubParser(); + // Truncated zips are salvaged by DefaultZipContainerDetector and the + // recovered ZipFile is handed to EpubParser via openContainer. + // EpubParser itself no longer salvages — it relies on the detector. List<Metadata> metadataList; try (TikaInputStream tis = truncate("testEPUB.epub", 10000)) { - metadataList = getRecursiveMetadata(tis, p, true); + metadataList = getRecursiveMetadata(tis, true); } String xml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); int ch1 = xml.indexOf("<h1>Chapter 1");
