This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0b38268d4f TIKA-4735 -- fix content-only (#2826)
0b38268d4f is described below

commit 0b38268d4f4090df6e06888042f0d20e8d34339b
Author: Tim Allison <[email protected]>
AuthorDate: Thu May 21 13:19:22 2026 -0400

    TIKA-4735 -- fix content-only (#2826)
---
 docs/modules/ROOT/pages/pipes/parse-modes.adoc     |  4 +-
 docs/modules/ROOT/pages/using-tika/cli/index.adoc  |  4 +-
 .../main/java/org/apache/tika/cli/AsyncHelper.java | 14 +++---
 .../java/org/apache/tika/cli/AsyncHelperTest.java  | 12 ++---
 .../apache/tika/async/cli/AsyncProcessorTest.java  | 42 +++++++++++++++++
 .../configs/config-content-only-default.json       | 54 ++++++++++++++++++++++
 .../apache/tika/pipes/core/server/PipesWorker.java |  3 ++
 7 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/docs/modules/ROOT/pages/pipes/parse-modes.adoc b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
index 9c1bf96860..69ba2204cd 100644
--- a/docs/modules/ROOT/pages/pipes/parse-modes.adoc
+++ b/docs/modules/ROOT/pages/pipes/parse-modes.adoc
@@ -52,7 +52,7 @@ See <<no-parse-mode>>.
 == Content Handler Types
 
 The content handler type determines the format of the extracted text. It is set on the
-`ContentHandlerFactory` configured in `parseContext` (or via the CLI `-h` flag), and applies
+`ContentHandlerFactory` configured in `parseContext` (or via the CLI `--handler` flag), and applies
 to all modes that produce content (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`).
 
 [cols="1,1,2"]
@@ -152,7 +152,7 @@ flag:
 
 [source,bash]
 ----
-java -jar tika-app.jar -i /input -o /output -h m --content-only
+java -jar tika-app.jar -i /input -o /output --handler m --content-only
 ----
 
 This produces `.md` files (when using the `m` handler type) containing only the
diff --git a/docs/modules/ROOT/pages/using-tika/cli/index.adoc b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
index e3abc00a3c..f5d15608cf 100644
--- a/docs/modules/ROOT/pages/using-tika/cli/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/cli/index.adoc
@@ -191,7 +191,7 @@ to the output directory.
 |`-o`
 |Output directory
 
-|`-h` or `--handlerType`
+|`--handler`
 |Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore (default: `t`)
 
 |`--concatenate`
@@ -216,7 +216,7 @@ Extract markdown content only (no metadata) from all files:
 
 [source,bash]
 ----
-java -jar tika-app.jar -i /path/to/input -o /path/to/output -h m --content-only
+java -jar tika-app.jar -i /path/to/input -o /path/to/output --handler m --content-only
 ----
 
 This produces `.md` files in the output directory containing just the extracted markdown
diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
index e3561ecf5f..2320fa7df5 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java
@@ -59,17 +59,17 @@ public class AsyncHelper {
             } else if (arg.equals(UNPACK_INCLUDE_METADATA)) {
                 argList.add("--unpack-include-metadata");
             } else if (arg.equals("-t") || arg.equals("--text")) {
-                // Translate TikaCLI text output to TikaAsyncCLI handler type
-                argList.add("-h");
+                // Translate TikaCLI text output to the TikaAsyncCLI handler type.
+                // TikaAsyncCLI's handler option is --handler; -h there means --help.
+                argList.add("--handler");
                 argList.add("t");
             } else if (arg.equals("--html")) {
-                // Translate TikaCLI html output to TikaAsyncCLI handler type
-                // Note: TikaCLI uses -h for html, but TikaAsyncCLI uses -h for handler type
-                argList.add("-h");
+                // Translate TikaCLI html output to the TikaAsyncCLI handler type.
+                argList.add("--handler");
                 argList.add("h");
             } else if (arg.equals("-x") || arg.equals("--xml")) {
-                // Translate TikaCLI xml output to TikaAsyncCLI handler type
-                argList.add("-h");
+                // Translate TikaCLI xml output to the TikaAsyncCLI handler type.
+                argList.add("--handler");
                 argList.add("x");
             } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
                 // TikaAsyncCLI always outputs JSON with recursive metadata (RMETA mode)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
index a26f247500..f1a3b79864 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java
@@ -32,28 +32,28 @@ public class AsyncHelperTest {
     @Test
     public void testTextHandler() throws Exception {
         String[] args = new String[]{"-t", "input", "output"};
-        String[] expected = new String[]{"-h", "t", "input", "output"};
+        String[] expected = new String[]{"--handler", "t", "input", "output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
     @Test
     public void testTextHandlerLong() throws Exception {
         String[] args = new String[]{"--text", "input", "output"};
-        String[] expected = new String[]{"-h", "t", "input", "output"};
+        String[] expected = new String[]{"--handler", "t", "input", "output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
     @Test
     public void testHtmlHandler() throws Exception {
         String[] args = new String[]{"--html", "input", "output"};
-        String[] expected = new String[]{"-h", "h", "input", "output"};
+        String[] expected = new String[]{"--handler", "h", "input", "output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
     @Test
     public void testXmlHandler() throws Exception {
         String[] args = new String[]{"-x", "input", "output"};
-        String[] expected = new String[]{"-h", "x", "input", "output"};
+        String[] expected = new String[]{"--handler", "x", "input", "output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
@@ -61,14 +61,14 @@ public class AsyncHelperTest {
     public void testJsonRecursiveSkipped() throws Exception {
         // -J is the default in async mode, so it's just skipped
         String[] args = new String[]{"-J", "-t", "input", "output"};
-        String[] expected = new String[]{"-h", "t", "input", "output"};
+        String[] expected = new String[]{"--handler", "t", "input", "output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 
     @Test
     public void testBatchModeWithOptions() throws Exception {
         String[] args = new String[]{"-J", "-t", "/path/to/input", "/path/to/output"};
-        String[] expected = new String[]{"-h", "t", "/path/to/input", "/path/to/output"};
+        String[] expected = new String[]{"--handler", "t", "/path/to/input", "/path/to/output"};
         assertArrayEquals(expected, AsyncHelper.translateArgs(args));
     }
 }
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
index 585bf4b905..17fc05d2b7 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.async.cli;
 
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -159,6 +160,47 @@ public class AsyncProcessorTest extends TikaTest {
                 .get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testContentOnlyFromConfigDefault() throws Exception {
+        // TIKA-4735: parseMode set only as a PipesConfig default (not on the request
+        // context) must still be honored at emit time - the file should be raw content,
+        // not a JSON metadata wrapper.
+        Path contentOnlyConfig = configDir.resolve("tika-config-content-only.json");
+        Map<String, Object> replacements = new HashMap<>();
+        replacements.put("FETCHER_BASE_PATH", inputDir);
+        replacements.put("JSON_EMITTER_BASE_PATH", jsonOutputDir);
+        replacements.put("BYTES_EMITTER_BASE_PATH", bytesOutputDir);
+        replacements.put("PLUGIN_ROOTS", Paths.get("target/plugins"));
+        JsonConfigHelper.writeConfigFromResource("/configs/config-content-only-default.json",
+                AsyncProcessorTest.class, replacements, contentOnlyConfig);
+
+        AsyncProcessor processor = AsyncProcessor.load(contentOnlyConfig);
+
+        // Deliberately do NOT set ParseMode on the request context - it must come from
+        // the config default.
+        FetchEmitTuple t = new FetchEmitTuple("co-1", new FetchKey("fsf", "mock.xml"),
+                new EmitKey("fse-json", "emit-co"), new Metadata(), new ParseContext(),
+                FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);
+        processor.offer(t, 1000);
+        for (int i = 0; i < 10; i++) {
+            processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
+        }
+        while (processor.checkActive()) {
+            Thread.sleep(100);
+        }
+        processor.close();
+
+        String emitted = Files.readString(jsonOutputDir.resolve("emit-co"));
+        // Raw concatenated content (markdown may escape underscores), not a JSON wrapper.
+        assertContains("content", emitted);
+        assertContains("some", emitted);
+        assertFalse(emitted.contains(TikaCoreProperties.TIKA_CONTENT.getName()),
+                "content-only output must not contain the JSON content key: " + emitted);
+        String trimmed = emitted.trim();
+        assertFalse(trimmed.startsWith("[") || trimmed.startsWith("{"),
+                "content-only output must be raw content, not a JSON wrapper: " + emitted);
+    }
+
     @Test
     public void testStopsOnApplicationError() throws Exception {
         AsyncProcessor processor = AsyncProcessor.load(configDir.resolve("tika-config.json"));
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json b/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json
new file mode 100644
index 0000000000..7b7849a7b5
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/config-content-only-default.json
@@ -0,0 +1,54 @@
+{
+  "fetchers": {
+    "fsf": {
+      "file-system-fetcher": {
+        "basePath": "FETCHER_BASE_PATH",
+        "extractFileSystemMetadata": false
+      }
+    }
+  },
+  "emitters": {
+    "fse-json": {
+      "file-system-emitter": {
+        "basePath": "JSON_EMITTER_BASE_PATH",
+        "fileExtension": "",
+        "onExists": "EXCEPTION"
+      }
+    },
+    "fse-bytes": {
+      "file-system-emitter": {
+        "basePath": "BYTES_EMITTER_BASE_PATH",
+        "fileExtension": "",
+        "onExists": "EXCEPTION"
+      }
+    }
+  },
+  "parse-context": {
+    "timeout-limits": {
+      "progressTimeoutMillis": 60000
+    }
+  },
+  "pipes": {
+    "parseMode": "CONTENT_ONLY",
+    "emitWithinMillis": 10000,
+    "queueSize": 10000,
+    "numEmitters": 1,
+    "emitIntermediateResults": false,
+    "startupTimeoutMillis": 240000,
+    "sleepOnStartupTimeoutMillis": 240000,
+    "shutdownClientAfterMillis": 300000,
+    "numClients": 2,
+    "maxFilesProcessedPerProcess": 10000,
+    "staleFetcherTimeoutSeconds": 600,
+    "staleFetcherDelaySeconds": 60,
+    "forkedJvmArgs": [
+      "-Xmx1g",
+      "-XX:+UseG1GC"
+    ],
+    "javaPath": "java",
+    "emitStrategy": {
+      "type": "EMIT_ALL"
+    }
+  },
+  "plugin-roots": "PLUGIN_ROOTS"
+}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
index b60ed056c3..9354517c7a 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java
@@ -530,7 +530,10 @@ class PipesWorker implements Callable<PipesResult> {
 
         ParseMode parseMode = parseContext.get(ParseMode.class);
         if (parseMode == null) {
+            // Write the resolved default back so EmitHandler sees it too - it reads ParseMode
+            // only from the context, with no fallback to the PipesConfig default (TIKA-4735).
             parseMode = defaultParseMode;
+            parseContext.set(ParseMode.class, parseMode);
         }
         UnpackConfig unpackConfig = parseContext.get(UnpackConfig.class);
 

Reply via email to