This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5463cd6426 TIKA-4727: improve parsemode configuration (#2815)
5463cd6426 is described below
commit 5463cd64269fcc091042bdd100ee05a0594f7b7a
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 14 12:37:17 2026 -0400
TIKA-4727: improve parsemode configuration (#2815)
---
docs/modules/ROOT/pages/pipes/parse-modes.adoc | 7 ++++---
.../tika/pipes/core/server/ConnectionHandler.java | 2 +-
.../apache/tika/pipes/core/server/PipesServer.java | 5 +++--
.../apache/tika/pipes/core/server/PipesWorker.java | 8 +++++++-
.../tika/pipes/core/server/ServerProtocolIO.java | 19 ++++++++++++++-----
.../tika/pipes/core/server/SharedServerResources.java | 1 +
6 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 2a1af6a593..6e5f47fa4e 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -20,7 +20,8 @@
:toclevels: 3
Tika Pipes uses `ParseMode` to control how documents are parsed and how
results are emitted.
-The parse mode is set on the `ParseContext` or configured in `PipesConfig`.
+The parse mode is configured in the `pipes` section of the JSON config, or overridden per-request
+in the `parseContext` field of a `FetchEmitTuple`.
== Available Parse Modes
@@ -87,7 +88,7 @@ to all modes that produce content (`RMETA`, `CONCATENATE`,
`CONTENT_ONLY`).
[source,json]
----
{
- "parseContext": {
+ "pipes": {
"parseMode": "CONCATENATE"
}
}
@@ -125,7 +126,7 @@ useful for:
[source,json]
----
{
- "parseContext": {
+ "pipes": {
"parseMode": "CONTENT_ONLY"
}
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
index 3162f0922a..c6f802e516 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ConnectionHandler.java
@@ -230,7 +230,7 @@ public class ConnectionHandler implements Runnable,
Closeable {
resources.getEmitStrategy(), resources.getEmitterManager(),
threshold);
return new PipesWorker(fetchEmitTuple, mergedContext,
resources.getAutoDetectParser(),
resources.getEmitterManager(), fetchHandler, parseHandler,
emitHandler,
- resources.getDefaultMetadataWriteLimiterFactory());
+ resources.getDefaultMetadataWriteLimiterFactory(),
pipesConfig.getParseMode());
}
private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext
mergedContext,
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
index fb7a74551f..2056c6aab7 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java
@@ -403,7 +403,8 @@ public class PipesServer implements AutoCloseable {
long threshold = (thresholdBytes != null) ? thresholdBytes :
EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES;
EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter,
emitStrategy, emitterManager, threshold);
return new PipesWorker(fetchEmitTuple, mergedContext,
autoDetectParser, emitterManager,
- fetchHandler, parseHandler, emitHandler,
defaultMetadataWriteLimiterFactory);
+ fetchHandler, parseHandler, emitHandler,
defaultMetadataWriteLimiterFactory,
+ pipesConfig.getParseMode());
}
private void loopUntilDone(FetchEmitTuple fetchEmitTuple, ParseContext
mergedContext,
@@ -550,7 +551,7 @@ public class PipesServer implements AutoCloseable {
if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null)
{
mergedContext.set(EmbeddedDocumentExtractorFactory.class, new
UnpackExtractorFactory());
}
- // Overlay request's values (request takes precedence)
+ // Request-level values override config defaults
mergedContext.copyFrom(requestContext);
return mergedContext;
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index 136853e5d2..a76defc641 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -69,10 +69,12 @@ class PipesWorker implements Callable<PipesResult> {
private final ParseHandler parseHandler;
private final EmitHandler emitHandler;
private final MetadataWriteLimiterFactory
defaultMetadataWriteLimiterFactory;
+ private final ParseMode defaultParseMode;
public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext
parseContext, AutoDetectParser autoDetectParser,
EmitterManager emitterManager, FetchHandler
fetchHandler, ParseHandler parseHandler,
- EmitHandler emitHandler, MetadataWriteLimiterFactory
defaultMetadataWriteLimiterFactory) {
+ EmitHandler emitHandler, MetadataWriteLimiterFactory
defaultMetadataWriteLimiterFactory,
+ ParseMode defaultParseMode) {
this.fetchEmitTuple = fetchEmitTuple;
this.parseContext = parseContext;
this.autoDetectParser = autoDetectParser;
@@ -81,6 +83,7 @@ class PipesWorker implements Callable<PipesResult> {
this.parseHandler = parseHandler;
this.emitHandler = emitHandler;
this.defaultMetadataWriteLimiterFactory =
defaultMetadataWriteLimiterFactory;
+ this.defaultParseMode = defaultParseMode;
}
@Override
@@ -607,6 +610,9 @@ class PipesWorker implements Callable<PipesResult> {
}
ParseMode parseMode = parseContext.get(ParseMode.class);
+ if (parseMode == null) {
+ parseMode = defaultParseMode;
+ }
UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);
// For UNPACK mode, automatically set up byte extraction
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
index 3d71f87457..531db0036f 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ServerProtocolIO.java
@@ -20,6 +20,9 @@ import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -45,6 +48,8 @@ import org.apache.tika.utils.StringUtils;
*/
public class ServerProtocolIO {
+ private static final Logger LOG =
LoggerFactory.getLogger(ServerProtocolIO.class);
+
private final DataInputStream input;
private final DataOutputStream output;
@@ -122,12 +127,16 @@ public class ServerProtocolIO {
UnpackConfig unpackConfig = requestContext.get(UnpackConfig.class);
ParseMode parseMode = requestContext.get(ParseMode.class);
+ // Warn (don't throw) when UnpackConfig has an emitter but ParseMode is not UNPACK.
+ // The global parse-context may include UnpackConfig as a default for UNPACK pipe runs,
+ // but the /rmeta and /tika endpoints explicitly set RMETA mode and PipesWorker correctly
+ // ignores UnpackConfig for non-UNPACK modes. Throwing here would crash the child process.
if (unpackConfig != null &&
!StringUtils.isBlank(unpackConfig.getEmitter())
- && parseMode != ParseMode.UNPACK) {
- throw new TikaConfigException(
- "FetchEmitTuple has UnpackConfig with emitter '" +
unpackConfig.getEmitter() +
- "' but ParseMode is " + parseMode + ". " +
- "To extract embedded bytes, set ParseMode.UNPACK
in the ParseContext.");
+ && parseMode != null && parseMode != ParseMode.UNPACK) {
+ LOG.warn("FetchEmitTuple has UnpackConfig with emitter '{}' but
ParseMode is {}. "
+ + "UnpackConfig will be ignored. "
+ + "To extract embedded bytes, set ParseMode.UNPACK in the
ParseContext.",
+ unpackConfig.getEmitter(), parseMode);
}
}
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
index e92f3455cd..cb5f8412a6 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/SharedServerResources.java
@@ -158,6 +158,7 @@ public class SharedServerResources {
if (mergedContext.get(EmbeddedDocumentExtractorFactory.class) == null)
{
mergedContext.set(EmbeddedDocumentExtractorFactory.class, new
UnpackExtractorFactory());
}
+ // Request-level values override config defaults
mergedContext.copyFrom(requestContext);
return mergedContext;
}