This is an automated email from the ASF dual-hosted git repository.

nddipiazza pushed a commit to branch TIKA-4722-grpc-handler-type
in repository https://gitbox.apache.org/repos/asf/tika.git

commit acf2237c512b4b8c31686ef160a50ebfa120b63e
Author: Nicholas DiPiazza <[email protected]>
AuthorDate: Fri May 1 07:19:54 2026 -0500

    TIKA-4722: Add handler_type field to FetchAndParseRequest for per-request content handler configuration
    
    - Add handler_type (string, field 5) to FetchAndParseRequest proto message
    - TikaGrpcServerImpl maps handler_type to BasicContentHandlerFactory in ParseContext
    - Valid values: text (default), html, xml, body, ignore, markdown
    - Add HandlerTypeTest e2e test verifying HTML vs text output differs
    
    Co-authored-by: Copilot <[email protected]>
---
 .../tika/pipes/filesystem/HandlerTypeTest.java     | 114 +++++++++++++++++++++
 .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java |   8 ++
 tika-grpc/src/main/proto/tika.proto                |   4 +
 3 files changed, 126 insertions(+)

diff --git a/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java b/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java
new file mode 100644
index 0000000000..746f9aa229
--- /dev/null
+++ b/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.filesystem;
+
+import java.util.concurrent.TimeUnit;
+
+import io.grpc.ManagedChannel;
+import lombok.extern.slf4j.Slf4j;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.DisabledOnOs;
+import org.junit.jupiter.api.condition.OS;
+
+import org.apache.tika.FetchAndParseReply;
+import org.apache.tika.FetchAndParseRequest;
+import org.apache.tika.SaveFetcherReply;
+import org.apache.tika.SaveFetcherRequest;
+import org.apache.tika.TikaGrpc;
+import org.apache.tika.pipes.ExternalTestBase;
+import org.apache.tika.pipes.fetcher.fs.FileSystemFetcherConfig;
+
+/**
+ * Tests per-request handler_type configuration via FetchAndParseRequest.
+ *
+ * Verifies that clients can override the output content format on a per-request basis
+ * by setting handler_type to "html", "text", "xml", etc.
+ */
+@Slf4j
+@DisabledOnOs(value = OS.WINDOWS, disabledReason = "exec:exec classpath exceeds Windows CreateProcess command-line length limit")
+class HandlerTypeTest extends ExternalTestBase {
+
+    @Test
+    void testHtmlHandlerType() throws Exception {
+        String fetcherId = "handlerTypeFetcher";
+        ManagedChannel channel = getManagedChannel();
+        try {
+            TikaGrpc.TikaBlockingStub blockingStub = TikaGrpc.newBlockingStub(channel);
+
+            FileSystemFetcherConfig config = new FileSystemFetcherConfig();
+            boolean useLocalServer = Boolean.parseBoolean(System.getProperty("tika.e2e.useLocalServer", "true"));
+            config.setBasePath(useLocalServer ? TEST_FOLDER.getAbsolutePath() : GOV_DOCS_FOLDER);
+
+            SaveFetcherReply saveReply = blockingStub.saveFetcher(SaveFetcherRequest.newBuilder()
+                    .setFetcherId(fetcherId)
+                    .setFetcherClass("org.apache.tika.pipes.fetcher.fs.FileSystemFetcher")
+                    .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(config))
+                    .build());
+            log.info("Fetcher created: {}", saveReply.getFetcherId());
+
+            // Parse the sample HTML file with handler_type=html — expect HTML tags in output
+            FetchAndParseReply htmlReply = blockingStub.fetchAndParse(FetchAndParseRequest.newBuilder()
+                    .setFetcherId(fetcherId)
+                    .setFetchKey("test-fixtures/sample.html")
+                    .setHandlerType("html")
+                    .build());
+
+            log.info("HTML parse status: {}", htmlReply.getStatus());
+            Assertions.assertEquals("PARSE_SUCCESS", htmlReply.getStatus(),
+                    "Parse should succeed with handler_type=html");
+
+            String content = htmlReply.getFieldsMap().get("X-TIKA:content");
+            Assertions.assertNotNull(content, "Content should be present in response");
+            log.info("HTML content (first 200 chars): {}", content.substring(0, Math.min(200, content.length())));
+            Assertions.assertTrue(content.contains("<html") || content.contains("<body") || content.contains("<p"),
+                    "handler_type=html should produce HTML markup in content, got: " + content);
+
+            // Parse the same file with handler_type=text — expect plain text, no HTML tags
+            FetchAndParseReply textReply = blockingStub.fetchAndParse(FetchAndParseRequest.newBuilder()
+                    .setFetcherId(fetcherId)
+                    .setFetchKey("test-fixtures/sample.html")
+                    .setHandlerType("text")
+                    .build());
+
+            log.info("Text parse status: {}", textReply.getStatus());
+            Assertions.assertEquals("PARSE_SUCCESS", textReply.getStatus(),
+                    "Parse should succeed with handler_type=text");
+
+            String textContent = textReply.getFieldsMap().get("X-TIKA:content");
+            Assertions.assertNotNull(textContent, "Content should be present in text response");
+            log.info("Text content (first 200 chars): {}", textContent.substring(0, Math.min(200, textContent.length())));
+            Assertions.assertFalse(textContent.contains("<html") || textContent.contains("<body"),
+                    "handler_type=text should not produce HTML tags, got: " + textContent);
+
+            // Verify the two outputs differ
+            Assertions.assertNotEquals(content, textContent,
+                    "HTML and text outputs should differ for the same document");
+
+        } finally {
+            channel.shutdown();
+            try {
+                if (!channel.awaitTermination(5, TimeUnit.SECONDS)) {
+                    channel.shutdownNow();
+                }
+            } catch (InterruptedException e) {
+                channel.shutdownNow();
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+}
diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
index 8c6b96a415..7da458174c 100644
--- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
+++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
@@ -74,6 +74,7 @@ import org.apache.tika.pipes.core.fetcher.FetcherManager;
 import org.apache.tika.pipes.ignite.server.IgniteStoreServer;
 import org.apache.tika.plugins.ExtensionConfig;
 import org.apache.tika.plugins.TikaPluginManager;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 
 class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
     private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class);
@@ -223,6 +224,13 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
             if (StringUtils.isNotBlank(additionalFetchConfigJson)) {
                 parseContext.setJsonConfig(request.getFetcherId(), additionalFetchConfigJson);
             }
+            String handlerType = request.getHandlerType();
+            if (StringUtils.isNotBlank(handlerType)) {
+                BasicContentHandlerFactory.HANDLER_TYPE type =
+                        BasicContentHandlerFactory.parseHandlerType(handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+                parseContext.set(BasicContentHandlerFactory.class,
+                        new BasicContentHandlerFactory(type, -1));
+            }
             PipesResult pipesResult = pipesClient.process(new FetchEmitTuple(request.getFetchKey(), new FetchKey(fetcher.getExtensionConfig().id(), request.getFetchKey()),
                     new EmitKey(), tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
             FetchAndParseReply.Builder fetchReplyBuilder =
diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto
index 70fdf2f0af..d715cca37b 100644
--- a/tika-grpc/src/main/proto/tika.proto
+++ b/tika-grpc/src/main/proto/tika.proto
@@ -100,6 +100,10 @@ message FetchAndParseRequest {
   string additional_fetch_config_json = 3;
   // The ID of the emitter to use (optional). If not provided, no emitter will be used.
   string emitter_id = 4;
+  // The content handler type to use for this request, overriding the server default.
+  // Valid values: "text" (default), "html", "xml", "body", "ignore", "markdown".
+  // Use "html" to get structured HTML output instead of plain text.
+  string handler_type = 5;
 }
 
 message FetchAndParseReply {

Reply via email to