This is an automated email from the ASF dual-hosted git repository. nddipiazza pushed a commit to branch TIKA-4722-grpc-handler-type in repository https://gitbox.apache.org/repos/asf/tika.git
commit acf2237c512b4b8c31686ef160a50ebfa120b63e Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri May 1 07:19:54 2026 -0500 TIKA-4722: Add handler_type field to FetchAndParseRequest for per-request content handler configuration - Add handler_type (string, field 5) to FetchAndParseRequest proto message - TikaGrpcServerImpl maps handler_type to BasicContentHandlerFactory in ParseContext - Valid values: text (default), html, xml, body, ignore, markdown - Add HandlerTypeTest e2e test verifying HTML vs text output differs Co-authored-by: Copilot <[email protected]> --- .../tika/pipes/filesystem/HandlerTypeTest.java | 114 +++++++++++++++++++++ .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 8 ++ tika-grpc/src/main/proto/tika.proto | 4 + 3 files changed, 126 insertions(+) diff --git a/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java b/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java new file mode 100644 index 0000000000..746f9aa229 --- /dev/null +++ b/tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/filesystem/HandlerTypeTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.filesystem; + +import java.util.concurrent.TimeUnit; + +import io.grpc.ManagedChannel; +import lombok.extern.slf4j.Slf4j; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; + +import org.apache.tika.FetchAndParseReply; +import org.apache.tika.FetchAndParseRequest; +import org.apache.tika.SaveFetcherReply; +import org.apache.tika.SaveFetcherRequest; +import org.apache.tika.TikaGrpc; +import org.apache.tika.pipes.ExternalTestBase; +import org.apache.tika.pipes.fetcher.fs.FileSystemFetcherConfig; + +/** + * Tests per-request handler_type configuration via FetchAndParseRequest. + * + * Verifies that clients can override the output content format on a per-request basis + * by setting handler_type to "html", "text", "xml", etc. + */ +@Slf4j +@DisabledOnOs(value = OS.WINDOWS, disabledReason = "exec:exec classpath exceeds Windows CreateProcess command-line length limit") +class HandlerTypeTest extends ExternalTestBase { + + @Test + void testHtmlHandlerType() throws Exception { + String fetcherId = "handlerTypeFetcher"; + ManagedChannel channel = getManagedChannel(); + try { + TikaGrpc.TikaBlockingStub blockingStub = TikaGrpc.newBlockingStub(channel); + + FileSystemFetcherConfig config = new FileSystemFetcherConfig(); + boolean useLocalServer = Boolean.parseBoolean(System.getProperty("tika.e2e.useLocalServer", "true")); + config.setBasePath(useLocalServer ? 
TEST_FOLDER.getAbsolutePath() : GOV_DOCS_FOLDER); + + SaveFetcherReply saveReply = blockingStub.saveFetcher(SaveFetcherRequest.newBuilder() + .setFetcherId(fetcherId) + .setFetcherClass("org.apache.tika.pipes.fetcher.fs.FileSystemFetcher") + .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(config)) + .build()); + log.info("Fetcher created: {}", saveReply.getFetcherId()); + + // Parse the sample HTML file with handler_type=html — expect HTML tags in output + FetchAndParseReply htmlReply = blockingStub.fetchAndParse(FetchAndParseRequest.newBuilder() + .setFetcherId(fetcherId) + .setFetchKey("test-fixtures/sample.html") + .setHandlerType("html") + .build()); + + log.info("HTML parse status: {}", htmlReply.getStatus()); + Assertions.assertEquals("PARSE_SUCCESS", htmlReply.getStatus(), + "Parse should succeed with handler_type=html"); + + String content = htmlReply.getFieldsMap().get("X-TIKA:content"); + Assertions.assertNotNull(content, "Content should be present in response"); + log.info("HTML content (first 200 chars): {}", content.substring(0, Math.min(200, content.length()))); + Assertions.assertTrue(content.contains("<html") || content.contains("<body") || content.contains("<p"), + "handler_type=html should produce HTML markup in content, got: " + content); + + // Parse the same file with handler_type=text — expect plain text, no HTML tags + FetchAndParseReply textReply = blockingStub.fetchAndParse(FetchAndParseRequest.newBuilder() + .setFetcherId(fetcherId) + .setFetchKey("test-fixtures/sample.html") + .setHandlerType("text") + .build()); + + log.info("Text parse status: {}", textReply.getStatus()); + Assertions.assertEquals("PARSE_SUCCESS", textReply.getStatus(), + "Parse should succeed with handler_type=text"); + + String textContent = textReply.getFieldsMap().get("X-TIKA:content"); + Assertions.assertNotNull(textContent, "Content should be present in text response"); + log.info("Text content (first 200 chars): {}", textContent.substring(0, Math.min(200, 
textContent.length()))); + Assertions.assertFalse(textContent.contains("<html") || textContent.contains("<body"), + "handler_type=text should not produce HTML tags, got: " + textContent); + + // Verify the two outputs differ + Assertions.assertNotEquals(content, textContent, + "HTML and text outputs should differ for the same document"); + + } finally { + channel.shutdown(); + try { + if (!channel.awaitTermination(5, TimeUnit.SECONDS)) { + channel.shutdownNow(); + } + } catch (InterruptedException e) { + channel.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + } +} diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 8c6b96a415..7da458174c 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -74,6 +74,7 @@ import org.apache.tika.pipes.core.fetcher.FetcherManager; import org.apache.tika.pipes.ignite.server.IgniteStoreServer; import org.apache.tika.plugins.ExtensionConfig; import org.apache.tika.plugins.TikaPluginManager; +import org.apache.tika.sax.BasicContentHandlerFactory; class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class); @@ -223,6 +224,13 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { if (StringUtils.isNotBlank(additionalFetchConfigJson)) { parseContext.setJsonConfig(request.getFetcherId(), additionalFetchConfigJson); } + String handlerType = request.getHandlerType(); + if (StringUtils.isNotBlank(handlerType)) { + BasicContentHandlerFactory.HANDLER_TYPE type = + BasicContentHandlerFactory.parseHandlerType(handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + parseContext.set(BasicContentHandlerFactory.class, + new BasicContentHandlerFactory(type, -1)); + } PipesResult pipesResult = pipesClient.process(new 
FetchEmitTuple(request.getFetchKey(), new FetchKey(fetcher.getExtensionConfig().id(), request.getFetchKey()), new EmitKey(), tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); FetchAndParseReply.Builder fetchReplyBuilder = diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto index 70fdf2f0af..d715cca37b 100644 --- a/tika-grpc/src/main/proto/tika.proto +++ b/tika-grpc/src/main/proto/tika.proto @@ -100,6 +100,10 @@ message FetchAndParseRequest { string additional_fetch_config_json = 3; // The ID of the emitter to use (optional). If not provided, no emitter will be used. string emitter_id = 4; + // The content handler type to use for this request (optional). If not provided, the server default is used. + // Valid values: "text", "html", "xml", "body", "ignore", "markdown"; an unrecognized value falls back to "text". + // Use "html" to get structured HTML output instead of plain text. + string handler_type = 5; } message FetchAndParseReply {
