This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch docs/pipes-updates
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 06270d3ae2d0b2f31e173f3fcbdb5376497d9432
Author: tallison <[email protected]>
AuthorDate: Mon May 11 16:17:42 2026 -0400

    azblob
---
 .../ROOT/examples/pipes-azblob-emitter.json        |   1 +
 .../ROOT/examples/pipes-azblob-fetcher.json        |   1 +
 .../ROOT/examples/pipes-azblob-iterator.json       |   1 +
 .../ROOT/examples/pipes-azblob-pipeline.json       |   1 +
 docs/modules/ROOT/nav.adoc                         |   1 +
 docs/modules/ROOT/pages/pipes/plugins/azblob.adoc  | 185 +++++++++++++++++++++
 .../tika/pipes/azblob/ConfigExamplesTest.java      | 134 +++++++++++++++
 .../resources/config-examples/az-blob-emitter.json |  14 ++
 .../resources/config-examples/az-blob-fetcher.json |  13 ++
 .../config-examples/az-blob-pipeline.json          |  45 +++++
 .../config-examples/az-blob-pipes-iterator.json    |  13 ++
 11 files changed, 409 insertions(+)

diff --git a/docs/modules/ROOT/examples/pipes-azblob-emitter.json 
b/docs/modules/ROOT/examples/pipes-azblob-emitter.json
new file mode 120000
index 0000000000..8213f434fa
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-azblob-emitter.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-azblob-fetcher.json 
b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json
new file mode 120000
index 0000000000..c7d8ce2d52
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-azblob-fetcher.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-azblob-iterator.json 
b/docs/modules/ROOT/examples/pipes-azblob-iterator.json
new file mode 120000
index 0000000000..bc68d45fb0
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-azblob-iterator.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/examples/pipes-azblob-pipeline.json 
b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json
new file mode 120000
index 0000000000..1e3c9dc860
--- /dev/null
+++ b/docs/modules/ROOT/examples/pipes-azblob-pipeline.json
@@ -0,0 +1 @@
+../../../../tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json
\ No newline at end of file
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
index e5e2a09624..e72c1d637b 100644
--- a/docs/modules/ROOT/nav.adoc
+++ b/docs/modules/ROOT/nav.adoc
@@ -35,6 +35,7 @@
 *** xref:pipes/plugins/filesystem.adoc[File System]
 *** xref:pipes/plugins/s3.adoc[Amazon S3]
 *** xref:pipes/plugins/gcs.adoc[Google Cloud Storage]
+*** xref:pipes/plugins/azblob.adoc[Azure Blob Storage]
 * xref:configuration/index.adoc[Configuration]
 ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
 ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR]
diff --git a/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc 
b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc
new file mode 100644
index 0000000000..1e462b0f70
--- /dev/null
+++ b/docs/modules/ROOT/pages/pipes/plugins/azblob.adoc
@@ -0,0 +1,185 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Azure Blob Storage Plugin
+:toc:
+:toclevels: 3
+
+The Azure Blob Storage plugin (`tika-pipes-az-blob`) provides fetcher, 
emitter, and iterator interfaces for blobs in Azure Storage containers.
+
+[cols="2,1,3"]
+|===
+|Interface |Component name |Class
+
+|Fetcher
+|`az-blob-fetcher`
+|`AZBlobFetcher`
+
+|Emitter
+|`az-blob-emitter`
+|`AZBlobEmitter`
+
+|Iterator
+|`az-blob-pipes-iterator`
+|`AZBlobPipesIterator`
+|===
+
+[#credentials]
+== Credentials
+
+All three components authenticate with a SAS (shared-access-signature) token. 
There are no other auth modes — managed identity, account keys, and AD-based 
auth are not currently exposed.
+
+* `endpoint` — base URL of the storage account, e.g., 
`https://myaccount.blob.core.windows.net`.
+* `sasToken` — the URL query-string portion of a generated SAS, without a 
leading `?`. Permissions in the token must match the operations the component 
will perform (read for fetchers/iterators, read+write for emitters).
+
+The emitter's `validate()` enforces that `sasToken`, `endpoint`, and 
`container` are all non-blank, but does not parse the SAS itself — invalid or 
expired tokens fail later when the Azure SDK makes a request.
+
+[#az-blob-fetcher]
+== Azure Blob Fetcher (`az-blob-fetcher`)
+
+Reads blobs from an Azure Storage container. The fetch key is the blob name.
+
+[source,json]
+----
+include::example$pipes-azblob-fetcher.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`endpoint`
+|_required_
+|Storage account URL.
+
+|`container`
+|_required_
+|Container name.
+
+|`sasToken`
+|_required_
+|SAS token granting read access to the container.
+
+|`spoolToTemp`
+|`true`
+|If `true`, the fetched blob is spooled to a temp file before parsing.
+
+|`extractUserMetadata`
+|`true`
+|If `true`, blob user-metadata is copied into the parsed `Metadata`.
+|===
+
+[#az-blob-emitter]
+== Azure Blob Emitter (`az-blob-emitter`)
+
+Writes parsed results to an Azure Storage container. The emit key (relative to 
`prefix`) is derived from the `FetchEmitTuple`.
+
+[source,json]
+----
+include::example$pipes-azblob-emitter.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`endpoint`
+|_required_
+|Storage account URL (validated non-blank).
+
+|`container`
+|_required_
+|Destination container name (validated non-blank).
+
+|`sasToken`
+|_required_
+|SAS token granting read+write access (validated non-blank).
+
+|`prefix`
+|_no default_
+|Optional blob-name prefix. A trailing `/` is stripped automatically.
+
+|`fileExtension`
+|`json`
+|Extension appended to each emitted blob name.
+
+|`overwriteExisting`
+|`false`
+|If `true`, an existing blob with the same name is overwritten; otherwise the 
emit fails.
+|===
+
+[#az-blob-iterator]
+== Azure Blob Iterator (`az-blob-pipes-iterator`)
+
+Lists blobs under a container/prefix and emits one `FetchEmitTuple` per blob.
+
+[source,json]
+----
+include::example$pipes-azblob-iterator.json[]
+----
+
+=== Configuration
+
+[cols="1,1,3"]
+|===
+|Field |Default |Description
+
+|`endpoint`
+|_required_
+|Storage account URL.
+
+|`container`
+|_required_
+|Container to enumerate.
+
+|`sasToken`
+|_required_
+|SAS token granting list+read access.
+
+|`prefix`
+|`""`
+|Blob-name prefix to scope the listing.
+
+|`timeoutMillis`
+|`360000`
+|Per-request timeout, in milliseconds (6 minutes by default).
+
+|`fetcherId` / `emitterId`
+|_required_
+|IDs of the fetcher and emitter to bind to each emitted tuple. See 
xref:pipes/iterators.adoc[Pipes Iterators] for the shared iterator contract.
+|===
+
+[#az-blob-pipeline]
+== Complete Pipeline Example
+
+The example below wires the Azure Blob fetcher, emitter, and iterator together 
into a container-to-container pipeline.
+
+[source,json]
+----
+include::example$pipes-azblob-pipeline.json[]
+----
+
+[#notes]
+== Notes
+
+* SAS tokens have an expiration baked in. For long-running pipelines, rotate 
the SAS or use a token that outlives the pipeline window.
+* Avoid checking real SAS tokens into source control — the strings in the 
examples above are placeholders.
+* Each component creates its own `BlobServiceClient`. The Azure SDK pools HTTP 
connections per client.
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java
new file mode 100644
index 0000000000..0a083f608a
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/java/org/apache/tika/pipes/azblob/ConfigExamplesTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.azblob;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.pipes.emitter.azblob.AZBlobEmitterConfig;
+import org.apache.tika.pipes.fetcher.azblob.config.AZBlobFetcherConfig;
+import org.apache.tika.pipes.iterator.azblob.AZBlobPipesIteratorConfig;
+
+/**
+ * Validates Azure Blob fetcher/emitter/iterator configuration examples used 
in documentation.
+ * <p>
+ * The JSON configuration examples are stored in {@code 
src/test/resources/config-examples/}
+ * and are included directly in the AsciiDoc documentation via the {@code 
include::} directive.
+ */
+public class ConfigExamplesTest {
+
+    private static final String EXAMPLES_DIR = "/config-examples/";
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    @TempDir
+    Path tempDir;
+
+    private String readExample(String resourceName) throws Exception {
+        try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + 
resourceName)) {
+            assertNotNull(is, "Resource not found: " + resourceName);
+            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    private void loadViaTikaLoader(String resourceName) throws Exception {
+        String json = readExample(resourceName);
+        Path configFile = tempDir.resolve("tika-config.json");
+        Files.writeString(configFile, json, StandardCharsets.UTF_8);
+        TikaLoader loader = TikaLoader.load(configFile);
+        assertNotNull(loader, "TikaLoader should not be null for: " + 
resourceName);
+    }
+
+    private JsonNode innerComponent(String json, String section, String id, 
String typeName)
+            throws Exception {
+        JsonNode root = OBJECT_MAPPER.readTree(json);
+        JsonNode sectionNode = root.get(section);
+        assertNotNull(sectionNode, "Missing section: " + section);
+        JsonNode idNode = id == null ? sectionNode : sectionNode.get(id);
+        assertNotNull(idNode, "Missing id: " + id);
+        JsonNode typed = idNode.get(typeName);
+        assertNotNull(typed, "Missing type: " + typeName);
+        return typed;
+    }
+
+    @Test
+    public void testAZBlobFetcherConfig() throws Exception {
+        loadViaTikaLoader("az-blob-fetcher.json");
+
+        JsonNode inner = innerComponent(readExample("az-blob-fetcher.json"),
+                "fetchers", "azf", "az-blob-fetcher");
+        AZBlobFetcherConfig config = 
AZBlobFetcherConfig.load(inner.toString());
+        assertEquals("tika-input", config.getContainer());
+        assertEquals("https://myaccount.blob.core.windows.net", config.getEndpoint());
+        assertNotNull(config.getSasToken());
+    }
+
+    @Test
+    public void testAZBlobEmitterConfig() throws Exception {
+        loadViaTikaLoader("az-blob-emitter.json");
+
+        JsonNode inner = innerComponent(readExample("az-blob-emitter.json"),
+                "emitters", "aze", "az-blob-emitter");
+        AZBlobEmitterConfig config = 
AZBlobEmitterConfig.load(inner.toString());
+        assertEquals("tika-output", config.container());
+        assertEquals("json", config.fileExtension());
+        config.validate();
+        assertEquals("results", config.getNormalizedPrefix());
+    }
+
+    @Test
+    public void testAZBlobIteratorConfig() throws Exception {
+        loadViaTikaLoader("az-blob-pipes-iterator.json");
+
+        JsonNode inner = 
innerComponent(readExample("az-blob-pipes-iterator.json"),
+                "pipes-iterator", null, "az-blob-pipes-iterator");
+        AZBlobPipesIteratorConfig config = 
AZBlobPipesIteratorConfig.load(inner.toString());
+        assertEquals("tika-input", config.getContainer());
+        assertEquals("incoming/", config.getPrefix());
+        assertEquals(360000L, config.getTimeoutMillis());
+        assertEquals("azf", config.getFetcherId());
+        assertEquals("aze", config.getEmitterId());
+    }
+
+    @Test
+    public void testAZBlobPipelineConfig() throws Exception {
+        loadViaTikaLoader("az-blob-pipeline.json");
+
+        String json = readExample("az-blob-pipeline.json");
+        AZBlobFetcherConfig fetcher = AZBlobFetcherConfig.load(
+                innerComponent(json, "fetchers", "azf", 
"az-blob-fetcher").toString());
+        AZBlobEmitterConfig emitter = AZBlobEmitterConfig.load(
+                innerComponent(json, "emitters", "aze", 
"az-blob-emitter").toString());
+        AZBlobPipesIteratorConfig iterator = AZBlobPipesIteratorConfig.load(
+                innerComponent(json, "pipes-iterator", null, 
"az-blob-pipes-iterator").toString());
+
+        emitter.validate();
+        assertEquals(fetcher.getContainer(), iterator.getContainer());
+        assertEquals("azf", iterator.getFetcherId());
+        assertEquals("aze", iterator.getEmitterId());
+    }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json
new file mode 100644
index 0000000000..9d102868c9
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-emitter.json
@@ -0,0 +1,14 @@
+{
+  "emitters": {
+    "aze": {
+      "az-blob-emitter": {
+        "endpoint": "https://myaccount.blob.core.windows.net",
+        "container": "tika-output",
+        "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+        "prefix": "results/",
+        "fileExtension": "json",
+        "overwriteExisting": false
+      }
+    }
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json
new file mode 100644
index 0000000000..aebdcedf93
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-fetcher.json
@@ -0,0 +1,13 @@
+{
+  "fetchers": {
+    "azf": {
+      "az-blob-fetcher": {
+        "endpoint": "https://myaccount.blob.core.windows.net",
+        "container": "tika-input",
+        "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+        "extractUserMetadata": true,
+        "spoolToTemp": true
+      }
+    }
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json
new file mode 100644
index 0000000000..65181a1a57
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipeline.json
@@ -0,0 +1,45 @@
+{
+  "content-handler-factory": {
+    "basic-content-handler-factory": {
+      "type": "TEXT",
+      "writeLimit": -1,
+      "throwOnWriteLimitReached": true
+    }
+  },
+  "fetchers": {
+    "azf": {
+      "az-blob-fetcher": {
+        "endpoint": "https://myaccount.blob.core.windows.net",
+        "container": "tika-input",
+        "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+        "extractUserMetadata": true
+      }
+    }
+  },
+  "emitters": {
+    "aze": {
+      "az-blob-emitter": {
+        "endpoint": "https://myaccount.blob.core.windows.net",
+        "container": "tika-output",
+        "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rwl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+        "prefix": "results/",
+        "fileExtension": "json"
+      }
+    }
+  },
+  "pipes-iterator": {
+    "az-blob-pipes-iterator": {
+      "endpoint": "https://myaccount.blob.core.windows.net",
+      "container": "tika-input",
+      "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+      "prefix": "incoming/",
+      "fetcherId": "azf",
+      "emitterId": "aze"
+    }
+  },
+  "pipes": {
+    "parseMode": "RMETA",
+    "onParseException": "EMIT",
+    "numClients": 4
+  }
+}
diff --git 
a/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json
 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json
new file mode 100644
index 0000000000..e2875fa92f
--- /dev/null
+++ 
b/tika-pipes/tika-pipes-plugins/tika-pipes-az-blob/src/test/resources/config-examples/az-blob-pipes-iterator.json
@@ -0,0 +1,13 @@
+{
+  "pipes-iterator": {
+    "az-blob-pipes-iterator": {
+      "endpoint": "https://myaccount.blob.core.windows.net",
+      "container": "tika-input",
+      "sasToken": "sv=2024-11-04&ss=b&srt=sco&sp=rl&se=2030-01-01T00:00:00Z&st=2024-01-01T00:00:00Z&spr=https&sig=REDACTED",
+      "prefix": "incoming/",
+      "timeoutMillis": 360000,
+      "fetcherId": "azf",
+      "emitterId": "aze"
+    }
+  }
+}

Reply via email to