This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch haystack-pipes-parsemode
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7b1b698e6680d5bb24b31e1651398b15270633cd
Author: tballison <[email protected]>
AuthorDate: Wed May 13 12:44:49 2026 -0400

    improve parsemode configuration
---
 docs/modules/ROOT/pages/pipes/parse-modes.adoc        |  7 ++++---
 .../tika/pipes/core/server/ConnectionHandler.java     |  2 +-
 .../apache/tika/pipes/core/server/PipesServer.java    |  6 ++++--
 .../apache/tika/pipes/core/server/PipesWorker.java    |  8 +++++++-
 .../tika/pipes/core/server/ServerProtocolIO.java      | 19 ++++++++++++++-----
 .../tika/pipes/core/server/SharedServerResources.java |  1 +
 tika-pipes/tika-pipes-fork-parser/pom.xml             | 11 +++++++++++
 7 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 2a1af6a593..6e5f47fa4e 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -20,7 +20,8 @@
 :toclevels: 3
 
 Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted.
-The parse mode is set on the `ParseContext` or configured in `PipesConfig`.
+The parse mode is configured in the `pipes` section of the JSON config, or overridden per-request
+in the `parseContext` field of a `FetchEmitTuple`.
 
 == Available Parse Modes
 
@@ -87,7 +88,7 @@ to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`).
 [source,json]
 ----
 {
-  "parseContext": {
+  "pipes": {
     "parseMode": "CONCATENATE"
   }
 }
@@ -125,7 +126,7 @@ useful for:
 [source,json]
 ----
 {
-  "parseContext": {
+  "pipes": {
     "parseMode": "CONTENT_ONLY"
   }
 }
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
index 3162f0922a..c6f802e516 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
@@ -230,7 +230,7 @@ public class ConnectionHandler implements Runnable, Closeable {
                 resources.getEmitStrategy(), resources.getEmitterManager(), 
threshold);
         return new PipesWorker(fetchEmitTuple, mergedContext, 
resources.getAutoDetectParser(),
                 resources.getEmitterManager(), fetchHandler, parseHandler, 
emitHandler,
-                resources.getDefaultMetadataWriteLimiterFactory());
+                resources.getDefaultMetadataWriteLimiterFactory(), 
pipesConfig.getParseMode());
     }
 
     private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext 
mergedContext,
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
index fb7a74551f..1fd9df1a2a 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
@@ -61,6 +61,7 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.pipes.api.FetchEmitTuple;
+import org.apache.tika.pipes.api.ParseMode;
 import org.apache.tika.pipes.api.PipesResult;
 import org.apache.tika.pipes.core.EmitStrategy;
 import org.apache.tika.pipes.core.EmitStrategyConfig;
@@ -403,7 +404,8 @@ public class PipesServer implements AutoCloseable {
         long threshold = (thresholdBytes != null) ? thresholdBytes : 
EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES;
         EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, 
emitStrategy, emitterManager, threshold);
         return new PipesWorker(fetchEmitTuple, mergedContext, 
autoDetectParser, emitterManager,
-                fetchHandler, parseHandler, emitHandler, 
defaultMetadataWriteLimiterFactory);
+                fetchHandler, parseHandler, emitHandler, 
defaultMetadataWriteLimiterFactory,
+                pipesConfig.getParseMode());
     }
 
     private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext 
mergedContext,
@@ -550,7 +552,7 @@ public class PipesServer implements AutoCloseable {
         if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null) 
{
             mergedContext.set(EmbeddedDocumentExtractorFactory.class, new 
UnpackExtractorFactory());
         }
-        // Overlay request's values (request takes precedence)
+        // Request-level values override config defaults
         mergedContext.copyFrom(requestContext);
         return mergedContext;
     }
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index 136853e5d2..a76defc641 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -69,10 +69,12 @@ class PipesWorker implements Callable<PipesResult> {
     private final ParseHandler parseHandler;
     private final EmitHandler emitHandler;
     private final MetadataWriteLimiterFactory 
defaultMetadataWriteLimiterFactory;
+    private final ParseMode defaultParseMode;
 
     public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext 
parseContext, AutoDetectParser autoDetectParser,
                        EmitterManager emitterManager, FetchHandler 
fetchHandler, ParseHandler parseHandler,
-                       EmitHandler emitHandler, MetadataWriteLimiterFactory 
defaultMetadataWriteLimiterFactory) {
+                       EmitHandler emitHandler, MetadataWriteLimiterFactory 
defaultMetadataWriteLimiterFactory,
+                       ParseMode defaultParseMode) {
         this.fetchEmitTuple = fetchEmitTuple;
         this.parseContext = parseContext;
         this.autoDetectParser = autoDetectParser;
@@ -81,6 +83,7 @@ class PipesWorker implements Callable<PipesResult> {
         this.parseHandler = parseHandler;
         this.emitHandler = emitHandler;
         this.defaultMetadataWriteLimiterFactory = 
defaultMetadataWriteLimiterFactory;
+        this.defaultParseMode = defaultParseMode;
     }
 
     @Override
@@ -607,6 +610,9 @@ class PipesWorker implements Callable<PipesResult> {
         }
 
         ParseMode parseMode = parseContext.get(ParseMode.class);
+        if (parseMode == null) {
+            parseMode = defaultParseMode;
+        }
         UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);
 
         // For UNPACK mode, automatically set up byte extraction
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
index 3d71f87457..531db0036f 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
@@ -20,6 +20,9 @@ import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -45,6 +48,8 @@ import org.apache.tika.utils.StringUtils;
  */
 public class ServerProtocolIO {
 
+    private static final Logger LOG = 
LoggerFactory.getLogger(ServerProtocolIO.class);
+
     private final DataInputStream input;
     private final DataOutputStream output;
 
@@ -122,12 +127,16 @@ public class ServerProtocolIO {
         UnpackConfig unpackConfig = requestContext.get(UnpackConfig.class);
         ParseMode parseMode = requestContext.get(ParseMode.class);
 
+        // Warn (don't throw) when UnpackConfig has an emitter but ParseMode is not UNPACK.
+        // The global parse-context may include UnpackConfig as a default for UNPACK pipe runs,
+        // but the /rmeta and /tika endpoints explicitly set RMETA mode and PipesWorker correctly
+        // ignores UnpackConfig for non-UNPACK modes. Throwing here would crash the child process.
         if (unpackConfig != null && 
!StringUtils.isBlank(unpackConfig.getEmitter())
-                && parseMode != ParseMode.UNPACK) {
-            throw new TikaConfigException(
-                    "FetchEmitTuple has UnpackConfig with emitter '" + 
unpackConfig.getEmitter() +
-                            "' but ParseMode is " + parseMode + ". " +
-                            "To extract embedded bytes, set ParseMode.UNPACK 
in the ParseContext.");
+                && parseMode != null && parseMode != ParseMode.UNPACK) {
+            LOG.warn("FetchEmitTuple has UnpackConfig with emitter '{}' but 
ParseMode is {}. "
+                    + "UnpackConfig will be ignored. "
+                    + "To extract embedded bytes, set ParseMode.UNPACK in the 
ParseContext.",
+                    unpackConfig.getEmitter(), parseMode);
         }
     }
 }
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
index e92f3455cd..cb5f8412a6 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
@@ -158,6 +158,7 @@ public class SharedServerResources {
         if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null) 
{
             mergedContext.set(EmbeddedDocumentExtractorFactory.class, new 
UnpackExtractorFactory());
         }
+        // Request-level values override config defaults
         mergedContext.copyFrom(requestContext);
         return mergedContext;
     }
diff --git a/tika-pipes/tika-pipes-fork-parser/pom.xml b/tika-pipes/tika-pipes-fork-parser/pom.xml
index fd5856aa6a..8d381e9ed3 100644
--- a/tika-pipes/tika-pipes-fork-parser/pom.xml
+++ b/tika-pipes/tika-pipes-fork-parser/pom.xml
@@ -59,6 +59,17 @@
       <version>${project.version}</version>
       <type>pom</type>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-langdetect-charsoup</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <!-- tika-eval: TikaEvalMetadataFilter (tokens, OOV, lang, languageness) -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-eval-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-core</artifactId>

Reply via email to