This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4749 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0df6cf2e277c8b19ef4ba71490c3c960b6e37b79 Author: tallison <[email protected]> AuthorDate: Wed Jun 3 14:50:52 2026 -0400 TIKA-4749 - improve inline handling of metadata only --- .../org/apache/tika/parser/AutoDetectParser.java | 4 +++ .../org/apache/tika/parser/MetadataOnlyParse.java | 35 +++++++++++++++++++++ .../tika/parser/pdf/image/ImageGraphicsEngine.java | 12 +++----- .../org/apache/tika/parser/pdf/PDFParserTest.java | 36 ++++++++++++++++++++-- 4 files changed, 78 insertions(+), 9 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 6867c622d2..6cff66c0b5 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -160,6 +160,10 @@ public class AutoDetectParser extends CompositeParser { // don't leak into CONTENT_TYPE metadata.set(Metadata.CONTENT_TYPE, EmbeddedDocumentUtil.normalizeMediaType(type.toString())); + // Metadata-only pseudo-parse: register the entry, skip the content parse. + if (context.get(MetadataOnlyParse.class) != null) { + return; + } //check for zero-byte inputstream if (tis.getOpenContainer() == null) { if (autoDetectParserConfig.getThrowOnZeroBytes()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java b/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java new file mode 100644 index 0000000000..26840cf9bc --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/parser/MetadataOnlyParse.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +/** + * ParseContext marker telling {@link AutoDetectParser} to register the embedded + * entry but skip the content parse. Set by metadata-only passes that pseudo-parse + * a placeholder stream only to register an entry. Independent of throwOnZeroBytes. + */ +public final class MetadataOnlyParse { + + /** + * Singleton instance indicating the current parse should not dispatch to a + * content parser. + */ + public static final MetadataOnlyParse INSTANCE = new MetadataOnlyParse(); + + private MetadataOnlyParse() { + // Private constructor for singleton + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java index 72f95d8a1d..f77c317236 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java @@ -57,7 +57,6 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; -import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.BoundedInputStream; @@ -65,6 +64,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaPagedText; +import org.apache.tika.parser.MetadataOnlyParse; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.pdf.PDMetadataExtractor; @@ -448,16 +448,14 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine { metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth()); metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight()); //TODO: what else can we extract from the PDImage without rendering? - ZeroByteFileException.IgnoreZeroByteFileException before = - parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class); + //Register the image's metadata entry without decoding it (marker skips the parse). try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { - parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, - ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION); + parseContext.set(MetadataOnlyParse.class, MetadataOnlyParse.INSTANCE); embeddedDocumentExtractor.parseEmbedded(tis, new EmbeddedContentHandler(xhtml), metadata, parseContext, false); } finally { - //replace whatever was there before - parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before); + //clear so it can't leak to the next image + parseContext.set(MetadataOnlyParse.class, null); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ccae004def..b8d3265b8f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -47,7 +47,6 @@ import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Font; @@ -62,7 +61,9 @@ import org.apache.tika.metadata.XMPMM; import org.apache.tika.metadata.XMPPDF; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.MetadataOnlyParse; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; @@ -1358,7 +1359,7 @@ public class PDFParserTest extends TikaTest { config.setExtractInlineImageMetadataOnly(true); context.set(PDFParserConfig.class, config); List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", context); - assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class)); + assertNull(context.get(MetadataOnlyParse.class)); assertEquals(2, metadataList.size()); assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE)); assertEquals("/image-0.png", @@ -1368,6 +1369,37 @@ public class PDFParserTest extends TikaTest { assertEquals("image-0.png", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); } + @Test + public void testExtractInlineImageMetadataThrowOnZeroBytesFalse() throws Exception { + //TIKA-4749: in metadata-only mode the inline image is registered via a + //placeholder pseudo-parse. With throwOnZeroBytes=false that placeholder used + //to be handed to a real parser (image/OCR), recording a spurious embedded + //exception. The MetadataOnlyParse marker must make it skip the parse instead. + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImageMetadataOnly(true); + context.set(PDFParserConfig.class, config); + + AutoDetectParser p = new AutoDetectParser(); + AutoDetectParserConfig adpc = new AutoDetectParserConfig(); + adpc.setThrowOnZeroBytes(false); + p.setAutoDetectParserConfig(adpc); + + List<Metadata> metadataList = + getRecursiveMetadata("testOCR.pdf", p, new Metadata(), context, false); + assertNull(context.get(MetadataOnlyParse.class)); + assertEquals(2, metadataList.size()); + Metadata image = metadataList.get(1); + assertEquals("image/png", image.get(Metadata.CONTENT_TYPE)); + assertEquals(261, (int) image.getInt(Metadata.IMAGE_LENGTH)); + assertEquals(934, (int) image.getInt(Metadata.IMAGE_WIDTH)); + //the placeholder must not be dispatched to any content parser. Without the + //fix it is (EmptyParser here; ImageParser+TesseractOCRParser when tesseract + //is installed, which is what records the spurious embedded exception). + assertEquals(0, image.getValues(TikaCoreProperties.TIKA_PARSED_BY).length); + assertNull(image.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + } + /** * Simple class to count end of document events. If functionality is useful, * move to org.apache.tika in src/test
