Re: [PR] Add OCR encode parser module [tika]

via GitHub Thu, 16 Apr 2026 05:39:27 -0700


Copilot commented on code in PR #2769:
URL: https://github.com/apache/tika/pull/2769#discussion_r3093220055



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRConfig.java:
##########
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Configuration for EncodeOCRParser. This class is not thread safe and must be
+ * synchronized externally.
+ * <p>
+ * This class will remember all set* field forever, and on
+ * {@link #cloneAndUpdate(EncodeOCRConfig)}, it will update all the fields that
+ * have been set on the "update" config. Create a new update config for each
+ * parse unless you're only changing the same field(s) with every parse.
+ */
+public class EncodeOCRConfig implements Serializable {
+
+    private static final long serialVersionUID = -1761942486845717891L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(
+        EncodeOCRConfig.class
+    );

Review Comment:
   `LOG` is declared but never used in this class. This adds noise and suggests 
missing logging; please remove the unused logger (and its imports) or actually 
use it for meaningful diagnostics.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+@TikaComponent(spi = false)
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    /**
+     * Constructor for JSON configuration.
+     * Requires Jackson on the classpath.
+     *
+     * @param jsonConfig JSON configuration
+     */
+    public EncodeOCRParser(JsonConfig jsonConfig) {
+        this(ConfigDeserializer.buildConfig(
+                jsonConfig, EncodeOCRConfig.class));
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig userConfig = context.get(EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            try {
+                config = defaultConfig.cloneAndUpdate(userConfig);
+            } catch (TikaException e) {
+                LOG.warn("Failed to merge config, using default", e);
+            }
+        }
+        if (config.isSkipOcr()) {
+            return Collections.emptySet();
+        }
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(
+            TikaInputStream tis,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext
+    ) throws IOException, SAXException, TikaException {
+        normalizeOCRMimeMetadata(metadata);
+
+        ParseContext workingContext =
+                parseContext != null ? parseContext : new ParseContext();
+
+        EncodeOCRConfig userConfig = workingContext.get(
+                EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
+
+        if (config != null && config.isSkipOcr()) {
+            return;
+        }
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tikaStream = TikaInputStream.get(
+                    tis, tmp, metadata);
+
+            ContentHandler baseHandler = getContentHandler(
+                    config.isInlineContent(),
+                    handler,
+                    metadata,
+                    workingContext);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                    baseHandler, metadata, workingContext);
+            xhtml.startDocument();
+            doEncode(tikaStream, xhtml, metadata, workingContext, config);
+            xhtml.endDocument();
+        }
+    }
+
+    private ContentHandler getContentHandler(
+            boolean isInlineContent,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext) {
+        if (!isInlineContent) {
+            return handler;
+        }
+        ParentContentHandler parentContentHandler = parseContext.get(
+                ParentContentHandler.class);
+        if (parentContentHandler == null) {
+            return handler;
+        }
+        String embeddedType = metadata.get(
+                TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (!TikaCoreProperties.EmbeddedResourceType.INLINE.name()
+                .equals(embeddedType)) {
+            return handler;
+        }
+        return new TeeContentHandler(
+                new EmbeddedContentHandler(
+                        new BodyContentHandler(
+                                parentContentHandler.getContentHandler())),
+                handler);
+    }
+
+    private void normalizeOCRMimeMetadata(Metadata metadata) {
+        String parserOverride = metadata.get(
+                TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+        if (parserOverride != null) {
+            MediaType overrideType = MediaType.parse(parserOverride);
+            if (overrideType != null
+                    && overrideType.getSubtype().startsWith(OCR)) {
+                metadata.remove(TikaCoreProperties
+                        .CONTENT_TYPE_PARSER_OVERRIDE.getName());
+            }
+        }
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (contentType != null) {
+            MediaType parsedType = MediaType.parse(contentType);
+            if (parsedType != null
+                    && parsedType.getSubtype().startsWith(OCR)) {
+                metadata.set(Metadata.CONTENT_TYPE,
+                        new MediaType(parsedType.getType(),
+                                parsedType.getSubtype().substring(
+                                        OCR.length())).toString());
+            }
+        }
+    }
+
+    private void doEncode(
+            TikaInputStream tikaInputStream,
+            ContentHandler xhtml,
+            Metadata metadata,
+            ParseContext parseContext,
+            EncodeOCRConfig config
+    ) throws IOException, SAXException, TikaException {
+        warnOnFirstParse();
+
+        long size = tikaInputStream.getLength();
+        if (size >= config.getMinFileSizeToOcr()
+                && size <= config.getMaxFileSizeToOcr()) {
+            if (!reserveImageSlot(parseContext, config)) {
+                OCRImageCounter counter = parseContext.get(
+                        OCRImageCounter.class);
+                int processed = counter != null
+                        ? counter.get()
+                        : config.getMaxImagesToOcr();
+                LOG.info("Skipping OCR encode for image because "
+                                + "the configured limit of {} images "
+                                + "has been reached ({} already processed)",
+                        config.getMaxImagesToOcr(), processed);
+                return;
+            }
+            encodeToBase64(tikaInputStream, size, xhtml);
+        } else {
+            LOG.debug("File size {} is outside the allowed "
+                            + "range for OCR encode: {} - {}",
+                    size,
+                    config.getMinFileSizeToOcr(),
+                    config.getMaxFileSizeToOcr());
+        }
+    }
+
+    private boolean reserveImageSlot(
+            ParseContext parseContext,
+            EncodeOCRConfig config) {
+        OCRImageCounter counter = parseContext.get(OCRImageCounter.class);
+        if (counter == null) {
+            counter = new OCRImageCounter();
+            parseContext.set(OCRImageCounter.class, counter);
+        }
+        return counter.tryIncrement(config.getMaxImagesToOcr());
+    }
+
+    private void encodeToBase64(
+            InputStream input,
+            long fileSize,
+            ContentHandler xhtml
+    ) throws IOException, TikaException {
+        long startTime = System.nanoTime();
+
+        try (Base64InputStream base64InputStream =
+                     new Base64InputStream(input, true)) {
+            int outputSize = extractOutput(base64InputStream, xhtml);
+            long endTime = System.nanoTime();
+            long durationMs = (endTime - startTime) / 1_000_000;
+            LOG.info("OCR encoding - input file size: {} bytes, "
+                            + "output size: {} characters, "
+                            + "time taken: {} ms",
+                    fileSize, outputSize, durationMs);
+        } catch (IOException | SAXException e) {

Review Comment:
   Per-image encoding logs at INFO level (`OCR encoding - input file size...`). 
When parsing documents with many embedded images this can generate very 
high-volume logs and impact operations. Consider downgrading this to DEBUG (or 
making it conditional on an explicit diagnostic flag), keeping only the 
one-time warning at INFO.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/test/java/org/apache/tika/parser/ocrencode/EncodeOCRParserTest.java:
##########
@@ -0,0 +1,443 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.Base64;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+
+public class EncodeOCRParserTest extends TikaTest {
+
+    // Markers as they appear in XML-serialized output
+    // (< and > are escaped by ToXMLContentHandler)
+    private static final String BEGIN_MARKER_XML =
+            "&lt;&lt;&lt;---IMAGE-BASE64-ENCODED-BEGIN---"
+                    + "&gt;&gt;&gt;";
+    private static final String END_MARKER_XML =
+            "&lt;&lt;&lt;---IMAGE-BASE64-ENCODED-END---"
+                    + "&gt;&gt;&gt;";
+
+    private Metadata getMetadata(MediaType mediaType) {
+        Metadata metadata = new Metadata();
+        MediaType ocrMediaType =
+                new MediaType(mediaType.getType(),
+                        "ocr-" + mediaType.getSubtype());
+        metadata.set(
+                TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                ocrMediaType.toString());
+        return metadata;
+    }
+
+    @Test
+    public void testBasicEncoding() throws Exception {
+        byte[] simplePng = createSimplePng();
+
+        EncodeOCRParser parser = new EncodeOCRParser();
+        TikaInputStream tis = TikaInputStream.get(simplePng);
+        String xml = getXML(tis, parser,
+                getMetadata(MediaType.image("png"))).xml;
+
+        // Should contain the base64 markers
+        assertContains(BEGIN_MARKER_XML, xml);
+        assertContains(END_MARKER_XML, xml);
+
+        // Should contain base64 encoded content
+        assertTrue(xml.contains("iVBOR"),
+                "Should contain base64 PNG signature");
+
+        // Should have ocr class div
+        assertContains("<div class=\"ocr\">", xml);
+    }
+
+    @Test
+    public void testBasicEncodingWithResourceFile() throws Exception {
+        EncodeOCRParser parser = new EncodeOCRParser();
+        String xml = getXML("testOCR_encode.png", parser,
+                getMetadata(MediaType.image("png"))).xml;
+
+        assertContains(BEGIN_MARKER_XML, xml);
+        assertContains(END_MARKER_XML, xml);
+        assertTrue(xml.contains("iVBOR"),
+                "Should contain base64 PNG signature");
+        assertContains("<div class=\"ocr\">", xml);
+    }
+
+    @Test
+    public void testJpegEncoding() throws Exception {
+        EncodeOCRParser parser = new EncodeOCRParser();
+        String xml = getXML("testOCR_encode.jpg", parser,
+                getMetadata(MediaType.image("jpeg"))).xml;
+
+        assertContains(BEGIN_MARKER_XML, xml);
+        assertContains(END_MARKER_XML, xml);
+        // JPEG base64 starts with /9j/
+        assertTrue(xml.contains("/9j/"),
+                "Should contain base64 JPEG signature");
+    }
+
+    @Test
+    public void testBase64OutputIsDecodable() throws Exception {
+        byte[] simplePng = createSimplePng();
+

Review Comment:
   `simplePng` is created but never used in this test method. Please remove the 
unused local variable to keep the test focused and avoid unused-variable 
warnings.
   ```suggestion
   
   ```



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.ConfigDeserializer;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.JsonConfig;
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+@TikaComponent(spi = false)
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    /**
+     * Constructor for JSON configuration.
+     * Requires Jackson on the classpath.
+     *
+     * @param jsonConfig JSON configuration
+     */
+    public EncodeOCRParser(JsonConfig jsonConfig) {
+        this(ConfigDeserializer.buildConfig(
+                jsonConfig, EncodeOCRConfig.class));
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig userConfig = context.get(EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            try {
+                config = defaultConfig.cloneAndUpdate(userConfig);
+            } catch (TikaException e) {
+                LOG.warn("Failed to merge config, using default", e);
+            }
+        }
+        if (config.isSkipOcr()) {
+            return Collections.emptySet();
+        }
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(
+            TikaInputStream tis,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext
+    ) throws IOException, SAXException, TikaException {
+        normalizeOCRMimeMetadata(metadata);
+
+        ParseContext workingContext =
+                parseContext != null ? parseContext : new ParseContext();
+
+        EncodeOCRConfig userConfig = workingContext.get(
+                EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
+
+        if (config != null && config.isSkipOcr()) {
+            return;
+        }
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tikaStream = TikaInputStream.get(
+                    tis, tmp, metadata);
+
+            ContentHandler baseHandler = getContentHandler(
+                    config.isInlineContent(),
+                    handler,
+                    metadata,
+                    workingContext);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                    baseHandler, metadata, workingContext);
+            xhtml.startDocument();
+            doEncode(tikaStream, xhtml, metadata, workingContext, config);
+            xhtml.endDocument();
+        }
+    }
+
+    private ContentHandler getContentHandler(
+            boolean isInlineContent,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext) {
+        if (!isInlineContent) {
+            return handler;
+        }
+        ParentContentHandler parentContentHandler = parseContext.get(
+                ParentContentHandler.class);
+        if (parentContentHandler == null) {
+            return handler;
+        }
+        String embeddedType = metadata.get(
+                TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (!TikaCoreProperties.EmbeddedResourceType.INLINE.name()
+                .equals(embeddedType)) {
+            return handler;
+        }
+        return new TeeContentHandler(
+                new EmbeddedContentHandler(
+                        new BodyContentHandler(
+                                parentContentHandler.getContentHandler())),
+                handler);
+    }
+
+    private void normalizeOCRMimeMetadata(Metadata metadata) {
+        String parserOverride = metadata.get(
+                TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
+        if (parserOverride != null) {
+            MediaType overrideType = MediaType.parse(parserOverride);
+            if (overrideType != null
+                    && overrideType.getSubtype().startsWith(OCR)) {
+                metadata.remove(TikaCoreProperties
+                        .CONTENT_TYPE_PARSER_OVERRIDE.getName());
+            }
+        }
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        if (contentType != null) {
+            MediaType parsedType = MediaType.parse(contentType);
+            if (parsedType != null
+                    && parsedType.getSubtype().startsWith(OCR)) {
+                metadata.set(Metadata.CONTENT_TYPE,
+                        new MediaType(parsedType.getType(),
+                                parsedType.getSubtype().substring(
+                                        OCR.length())).toString());
+            }
+        }
+    }
+
+    private void doEncode(
+            TikaInputStream tikaInputStream,
+            ContentHandler xhtml,
+            Metadata metadata,
+            ParseContext parseContext,
+            EncodeOCRConfig config
+    ) throws IOException, SAXException, TikaException {
+        warnOnFirstParse();
+
+        long size = tikaInputStream.getLength();
+        if (size >= config.getMinFileSizeToOcr()
+                && size <= config.getMaxFileSizeToOcr()) {
+            if (!reserveImageSlot(parseContext, config)) {
+                OCRImageCounter counter = parseContext.get(
+                        OCRImageCounter.class);
+                int processed = counter != null
+                        ? counter.get()
+                        : config.getMaxImagesToOcr();
+                LOG.info("Skipping OCR encode for image because "
+                                + "the configured limit of {} images "
+                                + "has been reached ({} already processed)",
+                        config.getMaxImagesToOcr(), processed);
+                return;
+            }
+            encodeToBase64(tikaInputStream, size, xhtml);
+        } else {
+            LOG.debug("File size {} is outside the allowed "
+                            + "range for OCR encode: {} - {}",
+                    size,
+                    config.getMinFileSizeToOcr(),
+                    config.getMaxFileSizeToOcr());
+        }
+    }
+
+    private boolean reserveImageSlot(
+            ParseContext parseContext,
+            EncodeOCRConfig config) {
+        OCRImageCounter counter = parseContext.get(OCRImageCounter.class);
+        if (counter == null) {
+            counter = new OCRImageCounter();
+            parseContext.set(OCRImageCounter.class, counter);
+        }
+        return counter.tryIncrement(config.getMaxImagesToOcr());
+    }
+
+    private void encodeToBase64(
+            InputStream input,
+            long fileSize,
+            ContentHandler xhtml
+    ) throws IOException, TikaException {
+        long startTime = System.nanoTime();
+
+        try (Base64InputStream base64InputStream =
+                     new Base64InputStream(input, true)) {
+            int outputSize = extractOutput(base64InputStream, xhtml);
+            long endTime = System.nanoTime();
+            long durationMs = (endTime - startTime) / 1_000_000;
+            LOG.info("OCR encoding - input file size: {} bytes, "
+                            + "output size: {} characters, "
+                            + "time taken: {} ms",
+                    fileSize, outputSize, durationMs);
+        } catch (IOException | SAXException e) {
+            throw new IOException("Error encoding bytes to base64", e);
+        }
+    }
+
+    private void warnOnFirstParse() {
+        if (!hasWarned) {
+            synchronized (LOCK) {
+                if (!hasWarned) {
+                    LOG.info("OCR encode is being invoked. "
+                            + "This can add greatly to processing time.");
+                    hasWarned = true;
+                }
+            }
+        }
+    }
+
+    /**
+     * Reads the contents of the given stream and writes it to the
+     * given XHTML content handler. The stream is closed once fully
+     * processed.
+     *
+     * @param stream Stream containing the base64-encoded data
+     * @param xhtml  XHTML content handler
+     * @return number of characters written
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException  if an input error occurred
+     */
+    private int extractOutput(InputStream stream, ContentHandler xhtml)
+            throws SAXException, IOException {
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "class", "class", "CDATA", "ocr");
+        xhtml.startElement(XHTML, "div", "div", attrs);
+
+        String beginMarker =
+                "\n<<<---IMAGE-BASE64-ENCODED-BEGIN--->>>\n";
+        xhtml.characters(
+                beginMarker.toCharArray(), 0, beginMarker.length());
+        int totalChars = 0;
+
+        try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+            char[] buffer = new char[1024];
+            int n;
+            while ((n = reader.read(buffer)) != -1) {
+                if (n > 0) {
+                    totalChars += n;
+                    xhtml.characters(buffer, 0, n);
+                }
+            }
+        }
+        String endMarker =
+                "\n<<<---IMAGE-BASE64-ENCODED-END--->>>\n";
+        xhtml.characters(
+                endMarker.toCharArray(), 0, endMarker.length());
+
+        xhtml.endElement(XHTML, "div", "div");
+        return totalChars;
+    }

Review Comment:
   `extractOutput` counts written characters in an `int` (`totalChars`). With 
`maxFileSizeToOcr` defaulting to `Integer.MAX_VALUE`, base64 output (about 4/3 
the input size, plus line separators) can exceed 2^31-1 chars, causing integer 
overflow and incorrect logging/metrics. Use a `long` counter (and return type / 
log variable) to avoid overflow.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Add OCR encode parser module [tika]

Reply via email to