This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch docs/pipes-updates
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cbc65c9bb88b85763660bb6d556a3dd87e5601d9
Author: tallison <[email protected]>
AuthorDate: Mon May 11 16:11:28 2026 -0400

    gcs
---
 docs/modules/ROOT/examples/pipes-gcs-emitter.json  |   1 +
 docs/modules/ROOT/examples/pipes-gcs-fetcher.json  |   1 +
 docs/modules/ROOT/examples/pipes-gcs-iterator.json |   1 +
 docs/modules/ROOT/examples/pipes-gcs-pipeline.json |   1 +
 docs/modules/ROOT/nav.adoc                         |   1 +
 docs/modules/ROOT/pages/pipes/plugins/gcs.adoc     | 166 +++++++++++++++++++++
 .../apache/tika/pipes/gcs/ConfigExamplesTest.java  | 133 +++++++++++++++++
 .../resources/config-examples/gcs-emitter.json     |  12 ++
 .../resources/config-examples/gcs-fetcher.json     |  12 ++
 .../resources/config-examples/gcs-pipeline.json    |  42 ++++++
 .../config-examples/gcs-pipes-iterator.json        |  11 ++
 11 files changed, 381 insertions(+)

diff --git a/docs/modules/ROOT/examples/pipes-gcs-emitter.json b/docs/modules/ROOT/examples/pipes-gcs-emitter.json
new file mode 120000
index 0000000000..48c994f74a
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-gcs-emitter.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-gcs-fetcher.json b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json
new file mode 120000
index 0000000000..8b390e310c
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-gcs-fetcher.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-gcs-iterator.json b/docs/modules/ROOT/examples/pipes-gcs-iterator.json
new file mode 120000
index 0000000000..d4f6b6b934
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-gcs-iterator.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-gcs-pipeline.json b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json
new file mode 120000
index 0000000000..621bad767e
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-gcs-pipeline.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index 90fce8701d..e5e2a09624 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -34,6 +34,7 @@
 ** xref:pipes/plugins/index.adoc[Plugins]
 *** xref:pipes/plugins/filesystem.adoc[File System]
 *** xref:pipes/plugins/s3.adoc[Amazon S3]
+*** xref:pipes/plugins/gcs.adoc[Google Cloud Storage]
 * xref:configuration/index.adoc[Configuration]
 ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
 ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
diff --git a/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc
new file mode 100644
index 0000000000..d639580d0f
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/gcs.adoc
@@ -0,0 +1,166 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Google Cloud Storage Plugin
+:toc:
+:toclevels: 3
+
+The Google Cloud Storage plugin (`tika-pipes-gcs`) provides fetcher, emitter, and iterator interfaces for objects in GCS buckets.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`gcs-fetcher`
+|`GCSFetcher`
+
+|Emitter
+|`gcs-emitter`
+|`GCSEmitter`
+
+|Iterator
+|`gcs-pipes-iterator`
+|`GCSPipesIterator`
+|===
+
+[#credentials]
+== Credentials
+
+The GCS plugin relies on Google's Application Default Credentials chain — there are no credential fields in the JSON config itself. Set credentials by:
+
+* Running on a GCP service (GCE/GKE/Cloud Run) — uses the attached service account automatically.
+* Setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of a service-account JSON key.
+* Running `gcloud auth application-default login` for local development.
+
+The `projectId` field in each component selects which GCP project to bill the API calls against; the service account or user must have storage access to the named bucket.
+
+[#gcs-fetcher]
+== GCS Fetcher (`gcs-fetcher`)
+
+Reads objects from a GCS bucket. The fetch key is the object name.
+
+[source,json]
+----
+include::example$pipes-gcs-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`projectId`
+|_required_
+|GCP project ID for billing/authentication.
+
+|`bucket`
+|_required_
+|GCS bucket name.
+
+|`spoolToTemp`
+|`true`
+|If `true`, the fetched object is spooled to a temp file before parsing.
+
+|`extractUserMetadata`
+|`true`
+|If `true`, GCS custom metadata is copied into the parsed `Metadata`.
+|===
+
+[#gcs-emitter]
+== GCS Emitter (`gcs-emitter`)
+
+Writes parsed results to a GCS bucket. The emit key (relative to `prefix`) is derived from the `FetchEmitTuple`.
+
+[source,json]
+----
+include::example$pipes-gcs-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`projectId`
+|_required_
+|GCP project ID (validated non-blank).
+
+|`bucket`
+|_required_
+|Destination GCS bucket (validated non-blank).
+
+|`prefix`
+|_no default_
+|Optional object-name prefix. A trailing `/` is stripped automatically.
+
+|`fileExtension`
+|`json`
+|Extension appended to each emitted object name.
+|===
+
+[#gcs-iterator]
+== GCS Iterator (`gcs-pipes-iterator`)
+
+Lists objects under a bucket/prefix and emits one `FetchEmitTuple` per object.
+
+[source,json]
+----
+include::example$pipes-gcs-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`bucket`
+|_required_
+|GCS bucket to enumerate.
+
+|`projectId`
+|`""`
+|GCP project ID for the listing API call.
+
+|`prefix`
+|`""`
+|Object-name prefix to scope the listing.
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#gcs-pipeline]
+== Complete Pipeline Example
+
+The example below wires the GCS fetcher, emitter, and iterator together for a bucket-to-bucket pipeline.
+
+[source,json]
+----
+include::example$pipes-gcs-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* The GCS plugin uses the official `google-cloud-storage` SDK. Set `GOOGLE_APPLICATION_CREDENTIALS` (or rely on workload identity / metadata server) to authenticate.
+* Each component creates its own `Storage` client. Heavy throughput should be balanced against your project's per-second request quota.
+* Unlike S3, there is no `path-style` toggle — GCS uses a single global endpoint.
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java
new file mode 100644
index 0000000000..7cfc1f3fb1
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/java/org/apache/tika/pipes/gcs/ConfigExamplesTest.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.gcs;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.pipes.emitter.gcs.GCSEmitterConfig;
+import org.apache.tika.pipes.fetcher.gcs.config.GCSFetcherConfig;
+import org.apache.tika.pipes.iterator.gcs.GCSPipesIteratorConfig;
+
+/**
+ * Validates GCS fetcher/emitter/iterator configuration examples used in documentation.
+ * <p>
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/}
+ * and are included directly in the AsciiDoc documentation via the {@code include::} directive.
+ */
+public class ConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    Path tempDir;
+
+    private String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    private void loadViaTikaLoader(String resourceName) throws Exception {
+        String json = readExample(resourceName);
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        TikaLoader loader = TikaLoader.load(configFile);
+        assertNotNull(loader, "TikaLoader should not be null for: " + resourceName);
+    }
+
+    private JsonNode innerComponent(String json, String section, String id, String typeName)
+            throws Exception {
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        JsonNode sectionNode = root.get(section);
+        assertNotNull(sectionNode, "Missing section: " + section);
+        JsonNode idNode = id == null ? sectionNode : sectionNode.get(id);
+        assertNotNull(idNode, "Missing id: " + id);
+        JsonNode typed = idNode.get(typeName);
+        assertNotNull(typed, "Missing type: " + typeName);
+        return typed;
+    }
+
+    @Test
+    public void testGCSFetcherConfig() throws Exception {
+        loadViaTikaLoader("gcs-fetcher.json");
+
+        JsonNode inner = innerComponent(readExample("gcs-fetcher.json"),
+                "fetchers", "gcsf", "gcs-fetcher");
+        GCSFetcherConfig config = GCSFetcherConfig.load(inner.toString());
+        assertEquals("my-gcp-project", config.getProjectId());
+        assertEquals("my-tika-input", config.getBucket());
+    }
+
+    @Test
+    public void testGCSEmitterConfig() throws Exception {
+        loadViaTikaLoader("gcs-emitter.json");
+
+        JsonNode inner = innerComponent(readExample("gcs-emitter.json"),
+                "emitters", "gcse", "gcs-emitter");
+        GCSEmitterConfig config = GCSEmitterConfig.load(inner.toString());
+        assertEquals("my-gcp-project", config.projectId());
+        assertEquals("my-tika-output", config.bucket());
+        assertEquals("json", config.fileExtension());
+        config.validate();
+        assertEquals("results", config.getNormalizedPrefix());
+    }
+
+    @Test
+    public void testGCSIteratorConfig() throws Exception {
+        loadViaTikaLoader("gcs-pipes-iterator.json");
+
+        JsonNode inner = innerComponent(readExample("gcs-pipes-iterator.json"),
+                "pipes-iterator", null, "gcs-pipes-iterator");
+        GCSPipesIteratorConfig config = GCSPipesIteratorConfig.load(inner.toString());
+        assertEquals("my-gcp-project", config.getProjectId());
+        assertEquals("my-tika-input", config.getBucket());
+        assertEquals("gcsf", config.getFetcherId());
+        assertEquals("gcse", config.getEmitterId());
+    }
+
+    @Test
+    public void testGCSPipelineConfig() throws Exception {
+        loadViaTikaLoader("gcs-pipeline.json");
+
+        String json = readExample("gcs-pipeline.json");
+        GCSFetcherConfig fetcher = GCSFetcherConfig.load(
+                innerComponent(json, "fetchers", "gcsf", "gcs-fetcher").toString());
+        GCSEmitterConfig emitter = GCSEmitterConfig.load(
+                innerComponent(json, "emitters", "gcse", "gcs-emitter").toString());
+        GCSPipesIteratorConfig iterator = GCSPipesIteratorConfig.load(
+                innerComponent(json, "pipes-iterator", null, "gcs-pipes-iterator").toString());
+
+        emitter.validate();
+        assertEquals(fetcher.getBucket(), iterator.getBucket());
+        assertEquals("gcsf", iterator.getFetcherId());
+        assertEquals("gcse", iterator.getEmitterId());
+    }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json
new file mode 100644
index 0000000000..6ba0603792
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-emitter.json
@@ -0,0 +1,12 @@
+{
+  "emitters": {
+    "gcse": {
+      "gcs-emitter": {
+        "projectId": "my-gcp-project",
+        "bucket": "my-tika-output",
+        "prefix": "results/",
+        "fileExtension": "json"
+      }
+    }
+  }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json
new file mode 100644
index 0000000000..89ab85eed3
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-fetcher.json
@@ -0,0 +1,12 @@
+{
+  "fetchers": {
+    "gcsf": {
+      "gcs-fetcher": {
+        "projectId": "my-gcp-project",
+        "bucket": "my-tika-input",
+        "extractUserMetadata": true,
+        "spoolToTemp": true
+      }
+    }
+  }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json
new file mode 100644
index 0000000000..8c483e5104
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipeline.json
@@ -0,0 +1,42 @@
+{
+  "content-handler-factory": {
+    "basic-content-handler-factory": {
+      "type": "TEXT",
+      "writeLimit": -1,
+      "throwOnWriteLimitReached": true
+    }
+  },
+  "fetchers": {
+    "gcsf": {
+      "gcs-fetcher": {
+        "projectId": "my-gcp-project",
+        "bucket": "my-tika-input",
+        "extractUserMetadata": true
+      }
+    }
+  },
+  "emitters": {
+    "gcse": {
+      "gcs-emitter": {
+        "projectId": "my-gcp-project",
+        "bucket": "my-tika-output",
+        "prefix": "results/",
+        "fileExtension": "json"
+      }
+    }
+  },
+  "pipes-iterator": {
+    "gcs-pipes-iterator": {
+      "projectId": "my-gcp-project",
+      "bucket": "my-tika-input",
+      "prefix": "incoming/",
+      "fetcherId": "gcsf",
+      "emitterId": "gcse"
+    }
+  },
+  "pipes": {
+    "parseMode": "RMETA",
+    "onParseException": "EMIT",
+    "numClients": 4
+  }
+}
diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json
new file mode 100644
index 0000000000..756e087848
--- /dev/null
+++ b/tika-pipes/tika-pipes-plugins/tika-pipes-gcs/src/test/resources/config-examples/gcs-pipes-iterator.json
@@ -0,0 +1,11 @@
+{
+  "pipes-iterator": {
+    "gcs-pipes-iterator": {
+      "projectId": "my-gcp-project",
+      "bucket": "my-tika-input",
+      "prefix": "incoming/",
+      "fetcherId": "gcsf",
+      "emitterId": "gcse"
+    }
+  }
+}

Reply via email to