This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0b38268d4f TIKA-4735 -- fix content-only (#2826)
0b38268d4f is described below
commit 0b38268d4f4090df6e06888042f0d20e8d34339b
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 21 13:19:22 2026 -0400
TIKA-4735 -- fix content-only (#2826)
---
docs/modules/ROOT/pages/pipes/parse-modes.adoc | 4 +-
docs/modules/ROOT/pages/using-tika/cli/index.adoc | 4 +-
.../main/java/org/apache/tika/cli/AsyncHelper.java | 14 +++---
.../java/org/apache/tika/cli/AsyncHelperTest.java | 12 ++---
.../apache/tika/async/cli/AsyncProcessorTest.java | 42 +++++++++++++++++
.../configs/config-content-only-default.json | 54 ++++++++++++++++++++++
.../apache/tika/pipes/core/server/PipesWorker.java | 3 ++
7 files changed, 116 insertions(+), 17 deletions(-)
diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 9c1bf96860..69ba2204cd 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -52,7 +52,7 @@ See <<no-parse-mode>>.
== Content Handler Types
The content handler type determines the format of the extracted text. It is set on the
-`ContentHandlerFactory` configured in `parseContext` (or via the CLI `-h` flag), and applies
+`ContentHandlerFactory` configured in `parseContext` (or via the CLI `--handler` flag), and applies
to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`).
[cols="1,1,2"]
@@ -152,7 +152,7 @@ flag:
[source,bash]
----
-java -jar tika-app.jar -i /input -o /output -h m --content-only
+java -jar tika-app.jar -i /input -o /output --handler m --content-only
----
This produces `.md` files (when using the `m` handler type) containing only the
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index e3abc00a3c..f5d15608cf 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -191,7 +191,7 @@ to the output directory.
|`-o`
|Output directory
-|`-h` or `--handlerType`
+|`--handler`
|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body,
`i`=ignore (default: `t`)
|`--concatenate`
@@ -216,7 +216,7 @@ Extract markdown content only (no metadata) from all files:
[source,bash]
----
-java -jar tika-app.jar -i /path/to/input -o /path/to/output -h m --content-only
+java -jar tika-app.jar -i /path/to/input -o /path/to/output --handler m --content-only
----
This produces `.md` files in the output directory containing just the extracted markdown
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index e3561ecf5f..2320fa7df5 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -59,17 +59,17 @@ public class AsyncHelper {
} else if (arg.equals(UNPACK_INCLUDE_METADATA)) {
argList.add("--unpack-include-metadata");
} else if (arg.equals("-t") || arg.equals("--text")) {
- // Translate TikaCLI text output to TikaAsyncCLI handler type
- argList.add("-h");
+ // Translate TikaCLI text output to the TikaAsyncCLI handler type.
+ // TikaAsyncCLI's handler option is --handler; -h there means --help.
+ argList.add("--handler");
argList.add("t");
} else if (arg.equals("--html")) {
- // Translate TikaCLI html output to TikaAsyncCLI handler type
- // Note: TikaCLI uses -h for html, but TikaAsyncCLI uses -h for handler type
- argList.add("-h");
+ // Translate TikaCLI html output to the TikaAsyncCLI handler type.
+ argList.add("--handler");
argList.add("h");
} else if (arg.equals("-x") || arg.equals("--xml")) {
- // Translate TikaCLI xml output to TikaAsyncCLI handler type
- argList.add("-h");
+ // Translate TikaCLI xml output to the TikaAsyncCLI handler type.
+ argList.add("--handler");
argList.add("x");
} else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
// TikaAsyncCLI always outputs JSON with recursive metadata (RMETA mode)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index a26f247500..f1a3b79864 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -32,28 +32,28 @@ public class AsyncHelperTest {
@Test
public void testTextHandler() throws Exception {
String[] args = new String[]{"-t", "input", "output"};
- String[] expected = new String[]{"-h", "t", "input", "output"};
+ String[] expected = new String[]{"--handler", "t", "input", "output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
@Test
public void testTextHandlerLong() throws Exception {
String[] args = new String[]{"--text", "input", "output"};
- String[] expected = new String[]{"-h", "t", "input", "output"};
+ String[] expected = new String[]{"--handler", "t", "input", "output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
@Test
public void testHtmlHandler() throws Exception {
String[] args = new String[]{"--html", "input", "output"};
- String[] expected = new String[]{"-h", "h", "input", "output"};
+ String[] expected = new String[]{"--handler", "h", "input", "output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
@Test
public void testXmlHandler() throws Exception {
String[] args = new String[]{"-x", "input", "output"};
- String[] expected = new String[]{"-h", "x", "input", "output"};
+ String[] expected = new String[]{"--handler", "x", "input", "output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
@@ -61,14 +61,14 @@ public class AsyncHelperTest {
public void testJsonRecursiveSkipped() throws Exception {
// -J is the default in async mode, so it's just skipped
String[] args = new String[]{"-J", "-t", "input", "output"};
- String[] expected = new String[]{"-h", "t", "input", "output"};
+ String[] expected = new String[]{"--handler", "t", "input", "output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
@Test
public void testBatchModeWithOptions() throws Exception {
String[] args = new String[]{"-J", "-t", "/path/to/input", "/path/to/output"};
- String[] expected = new String[]{"-h", "t", "/path/to/input", "/path/to/output"};
+ String[] expected = new String[]{"--handler", "t", "/path/to/input", "/path/to/output"};
assertArrayEquals(expected, AsyncHelper.translateArgs(args));
}
}
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 585bf4b905..17fc05d2b7 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.async.cli;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -159,6 +160,47 @@ public class AsyncProcessorTest extends TikaTest {
.get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testContentOnlyFromConfigDefault() throws Exception {
+ // TIKA-4735: parseMode set only as a PipesConfig default (not on the request
+ // context) must still be honored at emit time - the file should be raw content,
+ // not a JSON metadata wrapper.
+ Path contentOnlyConfig = configDir.resolve("tika-config-content-only.json");
+ Map<String, Object> replacements = new HashMap<>();
+ replacements.put("FETCHER_BASE_PATH", inputDir);
+ replacements.put("JSON_EMITTER_BASE_PATH", jsonOutputDir);
+ replacements.put("BYTES_EMITTER_BASE_PATH", bytesOutputDir);
+ replacements.put("PLUGIN_ROOTS", Paths.get("target/plugins"));
+ JsonConfigHelper.writeConfigFromResource("/configs/config-content-only-default.json",
+ AsyncProcessorTest.class, replacements, contentOnlyConfig);
+
+ AsyncProcessor processor = AsyncProcessor.load(contentOnlyConfig);
+
+ // Deliberately do NOT set ParseMode on the request context - it must come from
+ // the config default.
+ FetchEmitTuple t = new FetchEmitTuple("co-1", new FetchKey("fsf", "mock.xml"),
+ new EmitKey("fse-json", "emit-co"), new Metadata(), new ParseContext(),
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
+ processor.offer(t, 1000);
+ for (int i = 0; i < 10; i++) {
+ processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
+ }
+ while (processor.checkActive()) {
+ Thread.sleep(100);
+ }
+ processor.close();
+
+ String emitted = Files.readString(jsonOutputDir.resolve("emit-co"));
+ // Raw concatenated content (markdown may escape underscores), not a JSON wrapper.
+ assertContains("content", emitted);
+ assertContains("some", emitted);
+ assertFalse(emitted.contains(TikaCoreProperties.TIKA_CONTENT.getName()),
+ "content-only output must not contain the JSON content key: " + emitted);
+ String trimmed = emitted.trim();
+ assertFalse(trimmed.startsWith("[") || trimmed.startsWith("{"),
+ "content-only output must be raw content, not a JSON wrapper: " + emitted);
+ }
+
@Test
public void testStopsOnApplicationError() throws Exception {
AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json"));
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json b/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json
new file mode 100644
index 0000000000..7b7849a7b5
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json
@@ -0,0 +1,54 @@
+{
+ "fetchers": {
+ "fsf": {
+ "file-system-fetcher": {
+ "basePath": "FETCHER_BASE_PATH",
+ "extractFileSystemMetadata": false
+ }
+ }
+ },
+ "emitters": {
+ "fse-json": {
+ "file-system-emitter": {
+ "basePath": "JSON_EMITTER_BASE_PATH",
+ "fileExtension": "",
+ "onExists": "EXCEPTION"
+ }
+ },
+ "fse-bytes": {
+ "file-system-emitter": {
+ "basePath": "BYTES_EMITTER_BASE_PATH",
+ "fileExtension": "",
+ "onExists": "EXCEPTION"
+ }
+ }
+ },
+ "parse-context": {
+ "timeout-limits": {
+ "progressTimeoutMillis": 60000
+ }
+ },
+ "pipes": {
+ "parseMode": "CONTENT_ONLY",
+ "emitWithinMillis": 10000,
+ "queueSize": 10000,
+ "numEmitters": 1,
+ "emitIntermediateResults": false,
+ "startupTimeoutMillis": 240000,
+ "sleepOnStartupTimeoutMillis": 240000,
+ "shutdownClientAfterMillis": 300000,
+ "numClients": 2,
+ "maxFilesProcessedPerProcess": 10000,
+ "staleFetcherTimeoutSeconds": 600,
+ "staleFetcherDelaySeconds": 60,
+ "forkedJvmArgs": [
+ "-Xmx1g",
+ "-XX:+UseG1GC"
+ ],
+ "javaPath": "java",
+ "emitStrategy": {
+ "type": "EMIT_ALL"
+ }
+ },
+ "plugin-roots": "PLUGIN_ROOTS"
+}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index b60ed056c3..9354517c7a 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -530,7 +530,10 @@ class PipesWorker implements Callable<PipesResult> {
ParseMode parseMode = parseContext.get(ParseMode.class);
if (parseMode == null) {
+ // Write the resolved default back so EmitHandler sees it too - it reads ParseMode
+ // only from the context, with no fallback to the PipesConfig default (TIKA-4735).
parseMode = defaultParseMode;
+ parseContext.set(ParseMode.class, parseMode);
}
UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);