This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 327eda16fb TIKA-4753 - improve oom/timeout/crash msg (#2870)
327eda16fb is described below
commit 327eda16fbe7f9eea3a959b5db36ff60c3946ed8
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 5 12:47:38 2026 -0400
TIKA-4753 - improve oom/timeout/crash msg (#2870)
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
---
.../migration-to-4x/migrating-tika-server-4x.adoc | 23 +++++++
.../ROOT/pages/using-tika/server/index.adoc | 39 +++++++++++
.../apache/tika/server/core/TikaServerProcess.java | 2 +-
.../server/core/resource/PipesParsingHelper.java | 75 ++++++++++++++++------
.../org/apache/tika/server/core/CXFTestBase.java | 2 +-
.../server/core/TikaServerIntegrationTest.java | 28 ++++++--
6 files changed, 142 insertions(+), 27 deletions(-)
diff --git
a/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
b/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
index bd30d5a873..4a18868ec4 100644
--- a/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
+++ b/docs/modules/ROOT/pages/migration-to-4x/migrating-tika-server-4x.adoc
@@ -89,6 +89,29 @@ The separate `/config` endpoints have been removed.
Configuration is now handled
**Migration:** Use `POST /tika` or `POST /tika/json` with a `config` part in
your multipart request.
+=== Error Response Bodies Are Now JSON
+
+In 3.x, error responses from `/tika`, `/rmeta`, and `/unpack` returned a
plain-text
+body such as "Parse failed: TIMEOUT". In 4.x these endpoints return a JSON
body with
+at least a `status` field:
+
+[source,json]
+----
+{"status": "TIMEOUT"}
+----
+
+When the server is configured with `returnStackTrace=true` and the result has a non-blank message, a `message` field is
+also included (it may contain a server-side stack trace), e.g. `{"status": "TIMEOUT", "message": "Task timed out after 60000ms"}`.
+
+The HTTP status codes are also more precise:
+
+* `UNSPECIFIED_CRASH` changed from `500` to `503` — it is a transient process
failure
+ in the same category as `TIMEOUT` and `OOM`, not a server misconfiguration.
+
+**Migration:** Clients that parse plain-text error bodies must switch to JSON.
Clients
+that branch only on HTTP status code are unaffected unless they were treating
+`UNSPECIFIED_CRASH` as a `500`.
+
=== Accept Header Routing Removed
The `/tika` endpoint no longer routes based on `Accept` headers. Use explicit
paths instead:
diff --git a/docs/modules/ROOT/pages/using-tika/server/index.adoc
b/docs/modules/ROOT/pages/using-tika/server/index.adoc
index f9edd5f9d1..e630f97887 100644
--- a/docs/modules/ROOT/pages/using-tika/server/index.adoc
+++ b/docs/modules/ROOT/pages/using-tika/server/index.adoc
@@ -156,6 +156,45 @@ curl -T document.pdf
http://localhost:9998/meta/Content-Type # single field
* `/translate/all/\{translator}/\{src}/\{dest}` — translation
* `/pipes`, `/async` — Pipes-based bulk processing
+== Error Responses
+
+When a parse request fails — the forked child process timed out, ran out of memory, or
+crashed unexpectedly, or the request hit one of the initialization or fetch/emit errors listed
+below — the server returns an HTTP error with a JSON body that reports the `PipesResult` status:
+
+[source,json]
+----
+{"status": "TIMEOUT"}
+----
+
+The `status` field is the `PipesResult.RESULT_STATUS` enum name. By default the body
+carries only the `status`. When the server is configured with `returnStackTrace=true` and
+the result has a non-blank message, a `message` field is also included (it often contains a
+server-side stack trace), e.g. `{"status": "TIMEOUT", "message": "Task timed out after 60000ms"}`.
+
+[cols="1,1,3"]
+|===
+|HTTP status |`status` values |Meaning
+
+|`503 Service Unavailable`
+|`TIMEOUT`, `OOM`, `UNSPECIFIED_CRASH`, `CLIENT_UNAVAILABLE_WITHIN_MS`
+|The forked parse process failed, or no parse client became available within
the
+configured wait time (`CLIENT_UNAVAILABLE_WITHIN_MS`). The server is still
healthy;
+the client may retry.
+
+|`500 Internal Server Error`
+|`FAILED_TO_INITIALIZE`, `FETCH_EXCEPTION`, `EMIT_EXCEPTION`,
+`FETCHER_NOT_FOUND`, `EMITTER_NOT_FOUND`,
+`FETCHER_INITIALIZATION_EXCEPTION`, `EMITTER_INITIALIZATION_EXCEPTION`
+|Server misconfiguration or a task-level infrastructure error. Retrying the
same
+document on the same server is unlikely to succeed without a configuration fix.
+|===
+
+NOTE: A successful parse that encountered internal parser errors (e.g. a
truncated
+embedded document) still returns `200 OK`. The partial-parse exception is
surfaced
+in the `X-TIKA:EXCEPTION:container_exception` metadata field of the response,
not as an
+HTTP error code.
+
== Configuration
Server behavior beyond host/port is controlled by a JSON config file passed via
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index 36685f112a..0a31493e25 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -510,7 +510,7 @@ public class TikaServerProcess {
// Create and return the helper
PipesParsingHelper helper = new PipesParsingHelper(pipesParser,
pipesConfig,
- inputTempDirectory, unpackTempDirectory);
+ inputTempDirectory, unpackTempDirectory,
tikaServerConfig.isReturnStackTrace());
// Register shutdown hook to clean up PipesParser and temp directories
final Path inputDirToClean = inputTempDirectory;
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
index 15156e0b5c..1fc81a557b 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/PipesParsingHelper.java
@@ -24,7 +24,10 @@ import java.util.Collections;
import java.util.List;
import java.util.UUID;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import jakarta.ws.rs.WebApplicationException;
+import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -67,6 +70,7 @@ public class PipesParsingHelper {
private final PipesConfig pipesConfig;
private final Path inputTempDirectory;
private final Path unpackEmitterBasePath;
+ private final boolean returnStackTrace;
/**
* Creates a PipesParsingHelper.
@@ -78,13 +82,19 @@ public class PipesParsingHelper {
* @param unpackEmitterBasePath the basePath where the unpack-emitter
writes files.
* This is where the server will find the zip
files created
* by UNPACK mode. May be null if UNPACK mode
won't be used.
+ * @param returnStackTrace whether failure responses may include the
(potentially
+ * stack-trace-bearing) {@code PipesResult}
message. When false
+ * (the default), error bodies carry only the
status. Mirrors
+ * {@code TikaServerConfig.isReturnStackTrace()}.
*/
public PipesParsingHelper(PipesParser pipesParser, PipesConfig pipesConfig,
- Path inputTempDirectory, Path
unpackEmitterBasePath) {
+ Path inputTempDirectory, Path
unpackEmitterBasePath,
+ boolean returnStackTrace) {
this.pipesParser = pipesParser;
this.pipesConfig = pipesConfig;
this.inputTempDirectory = inputTempDirectory;
this.unpackEmitterBasePath = unpackEmitterBasePath;
+ this.returnStackTrace = returnStackTrace;
if (inputTempDirectory == null ||
!Files.isDirectory(inputTempDirectory)) {
throw new IllegalArgumentException(
@@ -184,33 +194,60 @@ public class PipesParsingHelper {
return ".tmp";
}
+ /**
+ * Builds a JSON error response carrying a subset of the {@code
PipesResult}
+ * serialization. By default the body is just {@code {"status":
"TIMEOUT"}}. The
+ * {@code PipesResult} message frequently contains a server-side stack
trace
+ * (e.g. for {@code *_EXCEPTION} statuses), so the {@code message} field
is included
+ * only when {@code returnStackTrace} is enabled — matching the legacy
+ * {@code TikaServerParseExceptionMapper}, which gates stack traces the
same way.
+ * Successful-parse fields such as {@code emitData} are never part of an
error body.
+ * <p>
+ * This allows clients to distinguish failure modes (TIMEOUT, OOM,
UNSPECIFIED_CRASH, …)
+ * without parsing plain-text bodies or inspecting custom headers.
+ */
+ private Response buildProcessFailureResponse(PipesResult result) {
+ ObjectMapper mapper = new ObjectMapper();
+ ObjectNode node = mapper.createObjectNode();
+ node.put("status", result.status().name());
+ if (returnStackTrace && result.message() != null &&
!result.message().isBlank()) {
+ node.put("message", result.message());
+ }
+ String json;
+ try {
+ json = mapper.writeValueAsString(node);
+ } catch (Exception e) {
+ LOG.warn("Failed to serialize PipesResult error response as JSON;
falling back to status-only body", e);
+ json = "{\"status\":\"" + result.status().name() + "\"}";
+ }
+ return Response.status(mapStatusToHttpResponse(result.status()))
+ .entity(json)
+ .type(MediaType.APPLICATION_JSON)
+ .build();
+ }
+
/**
* Processes the PipesResult and returns the metadata list.
*/
private List<Metadata> processResult(PipesResult result) {
if (result.isProcessCrash()) {
- // Process crashed (OOM, timeout, etc.) - return 503
+ // Process crashed (OOM, timeout, unspecified crash) — 503 with
JSON status body
LOG.warn("Parse process crashed: {}", result.status());
- throw new WebApplicationException(
- "Parse failed: " + result.status(),
- mapStatusToHttpResponse(result.status()));
+ throw new
WebApplicationException(buildProcessFailureResponse(result));
}
if (result.isFatal() || result.isInitializationFailure()) {
- // Fatal or initialization error - return 500
+ // Initialization/fatal error — JSON status body, HTTP status per
mapStatusToHttpResponse
+ // (500, or 503 for CLIENT_UNAVAILABLE_WITHIN_MS)
LOG.error("Parse initialization/fatal error: {} - {}",
result.status(), result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.status(),
- mapStatusToHttpResponse(result.status()));
+ throw new
WebApplicationException(buildProcessFailureResponse(result));
}
if (result.isTaskException()) {
- // Task-level exception (fetch/emit error) - return 500
+ // Task-level exception (fetch/emit error) — 500 with JSON status
body
LOG.warn("Parse task exception: {} - {}", result.status(),
result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.status(),
- Response.Status.INTERNAL_SERVER_ERROR);
+ throw new
WebApplicationException(buildProcessFailureResponse(result));
}
// Get metadata from result
@@ -241,9 +278,9 @@ public class PipesParsingHelper {
EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION,
EMIT_SUCCESS_PASSBACK,
PARSE_EXCEPTION_NO_EMIT ->
Response.Status.OK;
- case TIMEOUT, OOM, CLIENT_UNAVAILABLE_WITHIN_MS ->
+ case TIMEOUT, OOM, UNSPECIFIED_CRASH, CLIENT_UNAVAILABLE_WITHIN_MS
->
Response.Status.SERVICE_UNAVAILABLE;
- case UNSPECIFIED_CRASH, FETCH_EXCEPTION, EMIT_EXCEPTION,
+ case FETCH_EXCEPTION, EMIT_EXCEPTION,
FETCHER_NOT_FOUND, EMITTER_NOT_FOUND,
FETCHER_INITIALIZATION_EXCEPTION,
EMITTER_INITIALIZATION_EXCEPTION,
FAILED_TO_INITIALIZE ->
@@ -359,16 +396,12 @@ public class PipesParsingHelper {
// Check for errors
if (result.isProcessCrash() || result.isFatal() ||
result.isInitializationFailure()) {
LOG.warn("UNPACK parse failed: {} - {}", result.status(),
result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.status(),
- mapStatusToHttpResponse(result.status()));
+ throw new
WebApplicationException(buildProcessFailureResponse(result));
}
if (result.isTaskException()) {
LOG.warn("UNPACK task exception: {} - {}", result.status(),
result.message());
- throw new WebApplicationException(
- "Parse failed: " + result.message(),
- Response.Status.INTERNAL_SERVER_ERROR);
+ throw new
WebApplicationException(buildProcessFailureResponse(result));
}
// Get metadata list from result
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 87d5fe2bd3..888cfe22e5 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -214,7 +214,7 @@ public abstract class CXFTestBase {
pipesConfig.setEmitStrategy(new
EmitStrategyConfig(EmitStrategy.PASSBACK_ALL));
this.pipesParser = PipesParser.load(tikaJsonConfig, pipesConfig,
this.pipesConfigPath);
PipesParsingHelper pipesParsingHelper = new
PipesParsingHelper(this.pipesParser, pipesConfig,
- inputTempDirectory, getUnpackEmitterBasePath());
+ inputTempDirectory, getUnpackEmitterBasePath(), false);
TikaResource.init(tika, new ServerStatus(), pipesParsingHelper,
isEnableUnsecureFeatures());
} finally {
diff --git
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
index accbe372d9..0bc085274b 100644
---
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
+++
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerIntegrationTest.java
@@ -35,6 +35,8 @@ import java.nio.file.Paths;
import java.security.GeneralSecurityException;
import java.util.List;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.ws.rs.ProcessingException;
import jakarta.ws.rs.core.Response;
import org.apache.commons.io.IOUtils;
@@ -136,6 +138,7 @@ public class TikaServerIntegrationTest extends
IntegrationTestBase {
// Server should return 503 (Service Unavailable) for OOM, not crash
assertEquals(503, response.getStatus());
+ assertErrorResponseStatus(response, "OOM");
// Server should still be running - verify with a successful request
testBaseline();
@@ -155,6 +158,7 @@ public class TikaServerIntegrationTest extends
IntegrationTestBase {
// Server should return 503 (Service Unavailable) for OOM, not crash
assertEquals(503, response.getStatus());
+ assertErrorResponseStatus(response, "OOM");
// Server should still be running - verify with a successful request
testBaseline();
@@ -172,8 +176,9 @@ public class TikaServerIntegrationTest extends
IntegrationTestBase {
.accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_SYSTEM_EXIT));
- // Server should return 500 (Internal Server Error) for unspecified
crash
- assertEquals(500, response.getStatus());
+ // UNSPECIFIED_CRASH is a transient process failure — 503, same
category as OOM/TIMEOUT
+ assertEquals(503, response.getStatus());
+ assertErrorResponseStatus(response, "UNSPECIFIED_CRASH");
// Server should still be running - verify with a successful request
testBaseline();
@@ -191,8 +196,9 @@ public class TikaServerIntegrationTest extends
IntegrationTestBase {
.accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_SYSTEM_EXIT));
- // Server should return 500 (Internal Server Error) for unspecified
crash
- assertEquals(500, response.getStatus());
+ // UNSPECIFIED_CRASH is a transient process failure — 503, same
category as OOM/TIMEOUT
+ assertEquals(503, response.getStatus());
+ assertErrorResponseStatus(response, "UNSPECIFIED_CRASH");
// Server should still be running - verify with a successful request
testBaseline();
@@ -212,11 +218,25 @@ public class TikaServerIntegrationTest extends
IntegrationTestBase {
// Server should return 503 (Service Unavailable) for timeout
assertEquals(503, response.getStatus());
+ assertErrorResponseStatus(response, "TIMEOUT");
// Server should still be running - verify with a successful request
testBaseline();
}
+ /**
+ * Asserts that an error response body is JSON with a {@code status} field
matching
+ * {@code expectedStatus} (a {@code PipesResult.RESULT_STATUS} enum name).
+ */
+ private void assertErrorResponseStatus(Response response, String
expectedStatus) throws IOException {
+ try (InputStream is = (InputStream) response.getEntity()) {
+ String body = IOUtils.toString(is, UTF_8);
+ JsonNode node = new ObjectMapper().readTree(body);
+ assertEquals(expectedStatus, node.path("status").asText(null),
+ "Expected JSON error body with status=" + expectedStatus +
" but got: " + body);
+ }
+ }
+
private String getConfig(String configName) {
try {