This is an automated email from the ASF dual-hosted git repository.
fengzhang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 5633898 feat: Add options parameter to read_parquet() for S3 configuration (#44)
5633898 is described below
commit 5633898f591c88b9d1c4db8f292d40cb330bfb0e
Author: Feng Zhang <[email protected]>
AuthorDate: Wed Sep 10 10:34:46 2025 -0700
feat: Add options parameter to read_parquet() for S3 configuration (#44)
---
Cargo.lock | 172 ++++++++++-------------------
python/sedonadb/Cargo.toml | 1 +
python/sedonadb/python/sedonadb/context.py | 16 ++-
python/sedonadb/src/context.rs | 24 +++-
python/sedonadb/tests/test_context.py | 62 ++++++++++-
rust/sedona-geoparquet/src/provider.rs | 81 +++++++++++++-
rust/sedona/src/context.rs | 35 +++---
rust/sedona/src/object_storage.rs | 110 +++++++++++++-----
8 files changed, 333 insertions(+), 168 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index c630e5c..21d1617 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -124,12 +124,6 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -581,9 +575,9 @@ dependencies = [
[[package]]
name = "aws-lc-rs"
-version = "1.13.3"
+version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba"
+checksum = "94b8ff6c09cd57b16da53641caa860168b88c172a5ee163b0288d3d6eea12786"
dependencies = [
"aws-lc-sys",
"zeroize",
@@ -591,11 +585,11 @@ dependencies = [
[[package]]
name = "aws-lc-sys"
-version = "0.30.0"
+version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff"
+checksum = "0e44d16778acaf6a9ec9899b92cebd65580b83f685446bf2e1f5d3d732f99dcd"
dependencies = [
- "bindgen 0.69.5",
+ "bindgen 0.72.1",
"cc",
"cmake",
"dunce",
@@ -932,32 +926,29 @@ dependencies = [
[[package]]
name = "bindgen"
-version = "0.69.5"
+version = "0.71.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
+checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
- "itertools 0.12.1",
- "lazy_static",
- "lazycell",
+ "itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
- "rustc-hash 1.1.0",
+ "rustc-hash",
"shlex",
"syn 2.0.106",
- "which",
]
[[package]]
name = "bindgen"
-version = "0.71.1"
+version = "0.72.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
dependencies = [
"bitflags",
"cexpr",
@@ -968,7 +959,7 @@ dependencies = [
"proc-macro2",
"quote",
"regex",
- "rustc-hash 2.1.1",
+ "rustc-hash",
"shlex",
"syn 2.0.106",
]
@@ -1150,15 +1141,14 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chrono"
-version = "0.4.41"
+version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
+checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
dependencies = [
- "android-tzdata",
"iana-time-zone",
"num-traits",
"serde",
- "windows-link 0.1.3",
+ "windows-link 0.2.0",
]
[[package]]
@@ -1457,7 +1447,7 @@ dependencies = [
"crossterm_winapi",
"document-features",
"parking_lot",
- "rustix 1.0.8",
+ "rustix",
"winapi",
]
@@ -2383,12 +2373,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
-version = "0.3.13"
+version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.0",
]
[[package]]
@@ -2410,7 +2400,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78"
dependencies = [
"cfg-if",
- "rustix 1.0.8",
+ "rustix",
"windows-sys 0.59.0",
]
@@ -2627,7 +2617,7 @@ dependencies = [
[[package]]
name = "geo-generic-alg"
version = "0.1.0"
-source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#ebc0ca413d20c0106f832c59eebda386691c1456"
+source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#86f5f3aba769d998336a3c36ac0a3d9f5536596f"
dependencies = [
"earcutr",
"float_next_after",
@@ -2662,7 +2652,7 @@ dependencies = [
[[package]]
name = "geo-traits"
version = "0.2.0"
-source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#ebc0ca413d20c0106f832c59eebda386691c1456"
+source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#86f5f3aba769d998336a3c36ac0a3d9f5536596f"
dependencies = [
"geo-types",
]
@@ -2679,7 +2669,7 @@ dependencies = [
[[package]]
name = "geo-traits-ext"
version = "0.1.0"
-source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#ebc0ca413d20c0106f832c59eebda386691c1456"
+source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#86f5f3aba769d998336a3c36ac0a3d9f5536596f"
dependencies = [
"approx",
"geo-traits 0.2.0",
@@ -2691,7 +2681,7 @@ dependencies = [
[[package]]
name = "geo-types"
version = "0.7.16"
-source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#ebc0ca413d20c0106f832c59eebda386691c1456"
+source = "git+https://github.com/wherobots/geo.git?branch=generic-alg#86f5f3aba769d998336a3c36ac0a3d9f5536596f"
dependencies = [
"approx",
"num-traits",
@@ -2757,7 +2747,7 @@ dependencies = [
"js-sys",
"libc",
"r-efi",
- "wasi 0.14.4+wasi-0.2.4",
+ "wasi 0.14.5+wasi-0.2.4",
"wasm-bindgen",
]
@@ -3193,9 +3183,9 @@ dependencies = [
[[package]]
name = "indexmap"
-version = "2.11.0"
+version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9"
+checksum = "206a8042aec68fa4a62e8d3f7aa4ceb508177d9324faf261e1959e495b7a1921"
dependencies = [
"equivalent",
"hashbrown 0.15.5",
@@ -3275,15 +3265,6 @@ dependencies = [
"either",
]
-[[package]]
-name = "itertools"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
-dependencies = [
- "either",
-]
-
[[package]]
name = "itertools"
version = "0.13.0"
@@ -3352,18 +3333,6 @@ dependencies = [
"wasm-bindgen",
]
-[[package]]
-name = "lazy_static"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
-
-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
[[package]]
name = "lexical-core"
version = "1.0.5"
@@ -3541,15 +3510,9 @@ dependencies = [
[[package]]
name = "linux-raw-sys"
-version = "0.4.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
-
-[[package]]
-name = "linux-raw-sys"
-version = "0.9.4"
+version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
+checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
[[package]]
name = "litemap"
@@ -4249,7 +4212,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
- "rustc-hash 2.1.1",
+ "rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.16",
@@ -4269,7 +4232,7 @@ dependencies = [
"lru-slab",
"rand 0.9.2",
"ring",
- "rustc-hash 2.1.1",
+ "rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
@@ -4602,12 +4565,6 @@ version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace"
-[[package]]
-name = "rustc-hash"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
-
[[package]]
name = "rustc-hash"
version = "2.1.1"
@@ -4625,28 +4582,15 @@ dependencies = [
[[package]]
name = "rustix"
-version = "0.38.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
-dependencies = [
- "bitflags",
- "errno",
- "libc",
- "linux-raw-sys 0.4.15",
- "windows-sys 0.59.0",
-]
-
-[[package]]
-name = "rustix"
-version = "1.0.8"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8"
+checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
dependencies = [
"bitflags",
"errno",
"libc",
- "linux-raw-sys 0.9.4",
- "windows-sys 0.60.2",
+ "linux-raw-sys",
+ "windows-sys 0.61.0",
]
[[package]]
@@ -4752,11 +4696,11 @@ dependencies = [
[[package]]
name = "schannel"
-version = "0.1.27"
+version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
+checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.0",
]
[[package]]
@@ -5189,6 +5133,7 @@ dependencies = [
"pyo3",
"sedona",
"sedona-adbc",
+ "sedona-geoparquet",
"sedona-proj",
"sedona-schema",
"sedona-tg",
@@ -5500,21 +5445,21 @@ dependencies = [
[[package]]
name = "target-lexicon"
-version = "0.13.2"
+version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
+checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c"
[[package]]
name = "tempfile"
-version = "3.21.0"
+version = "3.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
+checksum = "84fa4d11fadde498443cca10fd3ac23c951f0dc59e080e9f4b93d4df4e4eea53"
dependencies = [
"fastrand",
"getrandom 0.3.3",
"once_cell",
- "rustix 1.0.8",
- "windows-sys 0.60.2",
+ "rustix",
+ "windows-sys 0.61.0",
]
[[package]]
@@ -5971,9 +5916,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "wasi"
-version = "0.14.4+wasi-0.2.4"
+version = "0.14.5+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4494f6290a82f5fe584817a676a34b9d6763e8d9d18204009fb31dceca98fd4"
+dependencies = [
+ "wasip2",
+]
+
+[[package]]
+name = "wasip2"
+version = "1.0.0+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88a5f4a424faf49c3c2c344f166f0662341d470ea185e939657aaff130f0ec4a"
+checksum = "03fa2761397e5bd52002cd7e73110c71af2109aca4e521a9f40473fe685b0a24"
dependencies = [
"wit-bindgen",
]
@@ -6083,18 +6037,6 @@ dependencies = [
"wasm-bindgen",
]
-[[package]]
-name = "which"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
-dependencies = [
- "either",
- "home",
- "once_cell",
- "rustix 0.38.44",
-]
-
[[package]]
name = "winapi"
version = "0.3.9"
@@ -6435,7 +6377,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af3a19837351dc82ba89f8a125e22a3c475f05aba604acc023d62b2739ae2909"
dependencies = [
"libc",
- "rustix 1.0.8",
+ "rustix",
]
[[package]]
diff --git a/python/sedonadb/Cargo.toml b/python/sedonadb/Cargo.toml
index 4e8a28a..9c3f096 100644
--- a/python/sedonadb/Cargo.toml
+++ b/python/sedonadb/Cargo.toml
@@ -41,6 +41,7 @@ futures = { workspace = true }
pyo3 = { version = "0.25.1", features = ["extension-module"] }
sedona = { path = "../../rust/sedona" }
sedona-adbc = { path = "../../rust/sedona-adbc" }
+sedona-geoparquet = { path = "../../rust/sedona-geoparquet" }
sedona-schema = { path = "../../rust/sedona-schema" }
sedona-proj = { path = "../../c/sedona-proj", default-features = false }
sedona-tg = { path = "../../c/sedona-tg" }
diff --git a/python/sedonadb/python/sedonadb/context.py b/python/sedonadb/python/sedonadb/context.py
index 40323a1..8e50d8f 100644
--- a/python/sedonadb/python/sedonadb/context.py
+++ b/python/sedonadb/python/sedonadb/context.py
@@ -17,7 +17,7 @@
import os
import sys
from pathlib import Path
-from typing import Iterable, Literal, Union, Any
+from typing import Any, Dict, Iterable, Literal, Optional, Union
from sedonadb._lib import InternalContext, configure_proj_shared
from sedonadb.dataframe import DataFrame, _create_data_frame
@@ -106,12 +106,18 @@ class SedonaContext:
"""
self._impl.drop_view(name)
- def read_parquet(self, table_paths: Union[str, Path, Iterable[str]]) -> DataFrame:
+ def read_parquet(
+ self,
+ table_paths: Union[str, Path, Iterable[str]],
+ options: Optional[Dict[str, Any]] = None,
+ ) -> DataFrame:
"""Create a [DataFrame][sedonadb.dataframe.DataFrame] from one or more
Parquet files
Args:
table_paths: A str, Path, or iterable of paths containing URLs to
Parquet
files.
+ options: Optional dictionary of options to pass to the Parquet reader.
+ For S3 access, use {"aws.skip_signature": True, "aws.region": "us-west-2"} for anonymous access to public buckets.
Examples:
@@ -124,8 +130,12 @@ class SedonaContext:
if isinstance(table_paths, (str, Path)):
table_paths = [table_paths]
+ if options is None:
+ options = {}
+
return DataFrame(
- self._impl, self._impl.read_parquet([str(path) for path in table_paths])
+ self._impl,
+ self._impl.read_parquet([str(path) for path in table_paths], options),
)
def sql(self, sql: str) -> DataFrame:
diff --git a/python/sedonadb/src/context.rs b/python/sedonadb/src/context.rs
index 7b66806..0e39a7c 100644
--- a/python/sedonadb/src/context.rs
+++ b/python/sedonadb/src/context.rs
@@ -14,7 +14,7 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};
use pyo3::prelude::*;
use sedona::context::SedonaContext;
@@ -74,11 +74,31 @@ impl InternalContext {
&self,
py: Python<'py>,
table_paths: Vec<String>,
+ options: HashMap<String, PyObject>,
) -> Result<InternalDataFrame, PySedonaError> {
+ // Convert Python options to strings, filtering out None values
+ let rust_options: HashMap<String, String> = options
+ .into_iter()
+ .filter_map(|(k, v)| {
+ if v.is_none(py) {
+ None
+ } else {
+ v.bind(py)
+ .str()
+ .and_then(|s| s.extract())
+ .map(|s: String| (k, s))
+ .ok()
+ }
+ })
+ .collect();
+
+ let geo_options =
+ sedona_geoparquet::provider::GeoParquetReadOptions::from_table_options(rust_options)
+ .map_err(|e| PySedonaError::SedonaPython(format!("Invalid table options: {e}")))?;
let df = wait_for_future(
py,
&self.runtime,
- self.inner.read_parquet(table_paths, Default::default()),
+ self.inner.read_parquet(table_paths, geo_options),
)??;
Ok(InternalDataFrame::new(df, self.runtime.clone()))
}
diff --git a/python/sedonadb/tests/test_context.py b/python/sedonadb/tests/test_context.py
index ce8b102..39ed012 100644
--- a/python/sedonadb/tests/test_context.py
+++ b/python/sedonadb/tests/test_context.py
@@ -63,7 +63,63 @@ def test_dynamic_object_stores():
schema = pa.schema(con.read_parquet(url))
assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
- # Fresh context
+
+def test_read_parquet_options_parameter(con, geoarrow_data):
+ """Test the options parameter functionality for read_parquet()"""
+ test_file = geoarrow_data / "quadrangles/files/quadrangles_100k_geo.parquet"
+
+ # Test 1: Backward compatibility - no options parameter
+ tab1 = con.read_parquet(test_file).to_arrow_table()
+ assert tab1["geometry"].type.extension_name == "geoarrow.wkb"
+
+ # Test 2: options=None (explicit None)
+ tab2 = con.read_parquet(test_file, options=None).to_arrow_table()
+ assert tab2["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab2) == len(tab1) # Should be identical
+
+ # Test 3: Empty options dictionary
+ tab3 = con.read_parquet(test_file, options={}).to_arrow_table()
+ assert tab3["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab3) == len(tab1) # Should be identical
+
+ # Test 4: Options with string values
+ tab4 = con.read_parquet(
+ test_file, options={"test.option": "value"}
+ ).to_arrow_table()
+ assert tab4["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab4) == len(
+ tab1
+ ) # Should be identical (option ignored but not errored)
+
+
+def test_read_geoparquet_s3_anonymous_access():
+ """Test reading from a public S3 bucket geoparquet file with anonymous access"""
con = sedonadb.connect()
- schema = pa.schema(con.sql(f"SELECT * FROM '{url}'"))
- assert schema.field("geometry").type.extension_name == "geoarrow.wkb"
+ s3_url = "s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"
+
+ # Use aws.skip_signature with region for anonymous access
+ tab = con.read_parquet(
+ s3_url, options={"aws.skip_signature": True, "aws.region": "us-west-2"}
+ ).to_arrow_table()
+ assert len(tab) > 0
+ assert "geom" in tab.schema.names # This dataset uses 'geom' instead of 'geometry'
+
+
+def test_read_parquet_invalid_aws_option():
+ """Test that invalid AWS options are caught and provide helpful error messages"""
+ con = sedonadb.connect()
+ url = "s3://wherobots-examples/data/onboarding_1/nyc_buildings.parquet"
+
+ # Test with a misspelled AWS option
+ with pytest.raises(
+ sedonadb._lib.SedonaError,
+ match=r"Unknown AWS option 'aws\.skip_sig'\..*aws\.skip_signature",
+ ):
+ con.read_parquet(url, options={"aws.skip_sig": "true"})
+
+ # Test with completely unknown AWS option
+ with pytest.raises(
+ sedonadb._lib.SedonaError,
+ match="Unknown AWS option.*aws.unknown_option.*Valid options are",
+ ):
+ con.read_parquet(url, options={"aws.unknown_option": "value"})
diff --git a/rust/sedona-geoparquet/src/provider.rs b/rust/sedona-geoparquet/src/provider.rs
index 6fcb4df..5b5e059 100644
--- a/rust/sedona-geoparquet/src/provider.rs
+++ b/rust/sedona-geoparquet/src/provider.rs
@@ -14,7 +14,7 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};
use arrow_schema::SchemaRef;
use async_trait::async_trait;
@@ -78,6 +78,7 @@ pub async fn geoparquet_listing_table(
#[derive(Default, Clone)]
pub struct GeoParquetReadOptions<'a> {
inner: ParquetReadOptions<'a>,
+ table_options: Option<HashMap<String, String>>,
}
impl GeoParquetReadOptions<'_> {
@@ -85,6 +86,72 @@ impl GeoParquetReadOptions<'_> {
pub fn new() -> Self {
Default::default()
}
+
+ /// Create GeoParquetReadOptions from table options HashMap
+ /// Validates that AWS options are spelled correctly to help catch user errors
+ pub fn from_table_options(options: HashMap<String, String>) -> Result<Self, String> {
+ // Validate AWS options to catch common misspellings
+ for key in options.keys() {
+ if key.starts_with("aws.") {
+ let common_aws_options = [
+ "aws.access_key_id",
+ "aws.secret_access_key",
+ "aws.region",
+ "aws.endpoint",
+ "aws.skip_signature",
+ "aws.nosign", // Alternative name for skip_signature
+ "aws.bucket_name",
+ "aws.use_ssl",
+ "aws.force_path_style",
+ ];
+
+ if !common_aws_options.contains(&key.as_str()) {
+ // Find potential matches for misspelled options
+ let close_matches: Vec<&str> = common_aws_options
+ .iter()
+ .filter(|&&option| {
+ // Check for similar starting patterns or abbreviations
+ let key_start = &key[4..]; // Remove "aws." prefix
+ let option_start = &option[4..]; // Remove "aws." prefix
+
+ // Check if the key is a prefix of the option (abbreviation)
+ // or if they share a common prefix of at least 4 characters
+ option_start.starts_with(key_start)
+ || key_start.starts_with(option_start)
+ || (key_start.len() >= 4
+ && option_start.len() >= 4
+ && key_start[..4] == option_start[..4])
+ })
+ .cloned()
+ .collect();
+
+ if !close_matches.is_empty() {
+ return Err(format!(
+ "Unknown AWS option '{}'. Did you mean: {}?",
+ key,
+ close_matches.join(", ")
+ ));
+ } else {
+ return Err(format!(
+ "Unknown AWS option '{}'. Valid options are: {}",
+ key,
+ common_aws_options.join(", ")
+ ));
+ }
+ }
+ }
+ }
+
+ Ok(GeoParquetReadOptions {
+ inner: ParquetReadOptions::default(),
+ table_options: Some(options),
+ })
+ }
+
+ /// Get the table options
+ pub fn table_options(&self) -> Option<&HashMap<String, String>> {
+ self.table_options.as_ref()
+ }
}
#[async_trait]
@@ -92,8 +159,18 @@ impl ReadOptions<'_> for GeoParquetReadOptions<'_> {
fn to_listing_options(
&self,
config: &SessionConfig,
- table_options: TableOptions,
+ mut table_options: TableOptions,
) -> ListingOptions {
+ // Merge custom table options if provided
+ if let Some(ref custom_options) = self.table_options {
+ for (key, value) in custom_options {
+ if let Err(_e) = table_options.set(key, value) {
+ // Silently continue for now - unknown options are ignored for compatibility
+ // The validation happens in from_table_options() method
+ }
+ }
+ }
+
let mut options = self.inner.to_listing_options(config, table_options);
if let Some(parquet_format) = options.format.as_any().downcast_ref::<ParquetFormat>() {
options.format = Arc::new(GeoParquetFormat::new(parquet_format));
diff --git a/rust/sedona/src/context.rs b/rust/sedona/src/context.rs
index dec90d1..3ca8fc6 100644
--- a/rust/sedona/src/context.rs
+++ b/rust/sedona/src/context.rs
@@ -19,7 +19,6 @@ use std::{collections::VecDeque, sync::Arc};
use crate::exec::create_plan_from_sql;
use crate::{
catalog::DynamicObjectStoreCatalog,
- object_storage::ensure_object_store_registered,
random_geometry_provider::RandomGeometryFunction,
show::{show_batches, DisplayTableOptions},
};
@@ -220,18 +219,28 @@ impl SedonaContext {
options: GeoParquetReadOptions<'_>,
) -> Result<DataFrame> {
let urls = table_paths.to_urls()?;
- let provider =
- match geoparquet_listing_table(&self.ctx, urls.clone(), options.clone()).await {
- Ok(provider) => provider,
- Err(e) => {
- if urls.is_empty() {
- return Err(e);
- }
-
- ensure_object_store_registered(&mut self.ctx.state(), urls[0].as_str()).await?;
- geoparquet_listing_table(&self.ctx, urls, options).await?
- }
- };
+
+ // Pre-register object store with our custom options before creating GeoParquetReadOptions
+ if !urls.is_empty() {
+ use crate::object_storage::ensure_object_store_registered_with_options;
+ // Extract the table options from GeoParquetReadOptions for object store registration
+ let table_options_map = options.table_options().cloned().unwrap_or_default();
+
+ // TODO: Consider registering object stores per-bucket instead of per-scheme to avoid
+ // authentication conflicts. Currently, if a user first accesses a public S3 bucket with
+ // aws.skip_signature=true and then tries to access a private bucket, the cached object
+ // store will still have skip_signature enabled, preventing authentication to the private
+ // bucket. A per-bucket registration approach would solve this by using bucket-specific
+ // cache keys like "s3://bucket-name" instead of just "s3://".
+ ensure_object_store_registered_with_options(
+ &mut self.ctx.state(),
+ urls[0].as_str(),
+ Some(&table_options_map),
+ )
+ .await?;
+ }
+
+ let provider = geoparquet_listing_table(&self.ctx, urls, options).await?;
self.ctx.read_table(Arc::new(provider))
}
diff --git a/rust/sedona/src/object_storage.rs b/rust/sedona/src/object_storage.rs
index 22bdfa0..f6d9065 100644
--- a/rust/sedona/src/object_storage.rs
+++ b/rust/sedona/src/object_storage.rs
@@ -52,7 +52,11 @@ use object_store::ClientOptions;
use crate::context::SedonaContext;
-pub async fn ensure_object_store_registered(state: &mut SessionState, name: &str) -> Result<()> {
+pub async fn ensure_object_store_registered_with_options(
+ state: &mut SessionState,
+ name: &str,
+ custom_options: Option<&HashMap<String, String>>,
+) -> Result<()> {
let optimized_name = substitute_tilde(name.to_owned());
let table_url = ListingTableUrl::parse(optimized_name.as_str())?;
let scheme = table_url.scheme();
@@ -67,17 +71,38 @@ pub async fn ensure_object_store_registered(state: &mut SessionState, name: &str
Err(_) => {
let mut builder = SessionStateBuilder::from(state.clone());
- // Register the store for this URL. Here we don't have access
- // to any command options so the only choice is to use an empty collection
+ // Register the store for this URL with custom options if provided
match scheme {
"s3" | "oss" | "cos" => {
if let Some(table_options) = builder.table_options() {
- table_options.extensions.insert(AwsOptions::default())
+ let mut aws_options = AwsOptions::default();
+
+ // Process custom options if provided
+ if let Some(options) = custom_options {
+ for (key, value) in options {
+ if key.starts_with("aws.") {
+ let _ = aws_options.set(key, value);
+ }
+ }
+ }
+
+ table_options.extensions.insert(aws_options);
}
}
"gs" | "gcs" => {
if let Some(table_options) = builder.table_options() {
- table_options.extensions.insert(GcpOptions::default())
+ let mut gcp_options = GcpOptions::default();
+
+ // Process custom options if provided
+ if let Some(options) = custom_options {
+ for (key, value) in options {
+ if key.starts_with("gcp.") {
+ let _ = gcp_options.set(key, value);
+ }
+ }
+ }
+
+ table_options.extensions.insert(gcp_options);
}
}
_ => {}
@@ -97,6 +122,11 @@ pub async fn ensure_object_store_registered(state: &mut SessionState, name: &str
Ok(())
}
+// Backward compatibility wrapper
+pub async fn ensure_object_store_registered(state: &mut SessionState, name: &str) -> Result<()> {
+ ensure_object_store_registered_with_options(state, name, None).await
+}
+
pub fn substitute_tilde(cur: String) -> String {
if let Some(usr_dir_path) = home_dir() {
if let Some(usr_dir) = usr_dir_path.to_str() {
@@ -120,37 +150,51 @@ pub async fn get_s3_object_store_builder(
region,
endpoint,
allow_http,
+ skip_signature,
} = aws_options;
let bucket_name = get_bucket_name(url)?;
- let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name);
- if let (Some(access_key_id), Some(secret_access_key)) = (access_key_id, secret_access_key) {
- builder = builder
- .with_access_key_id(access_key_id)
- .with_secret_access_key(secret_access_key);
-
- if let Some(session_token) = session_token {
- builder = builder.with_token(session_token);
- }
+ let mut builder = if let Some(true) = skip_signature {
+ // For anonymous access, create builder without loading environment credentials
+ // and immediately set skip_signature to prevent credential fetching
+ AmazonS3Builder::new()
+ .with_bucket_name(bucket_name)
+ .with_skip_signature(true)
} else {
- let config = aws_config::defaults(BehaviorVersion::latest()).load().await;
- if let Some(region) = config.region() {
- builder = builder.with_region(region.to_string());
- }
+ // For authenticated access, load from environment as usual
+ AmazonS3Builder::from_env().with_bucket_name(bucket_name)
+ };
+
+ // Handle authenticated access (only if not skipping signature)
+ if !matches!(skip_signature, Some(true)) {
+ if let (Some(access_key_id), Some(secret_access_key)) = (access_key_id, secret_access_key) {
+ builder = builder
+ .with_access_key_id(access_key_id)
+ .with_secret_access_key(secret_access_key);
- let credentials = config
- .credentials_provider()
- .ok_or_else(|| {
- DataFusionError::ObjectStore(Box::new(object_store::Error::Generic {
- store: "S3",
- source: "Failed to get S3 credentials from the environment".into(),
- }))
- })?
- .clone();
-
- let credentials = Arc::new(S3CredentialProvider { credentials });
- builder = builder.with_credentials(credentials);
+ if let Some(session_token) = session_token {
+ builder = builder.with_token(session_token);
+ }
+ } else {
+ let config = aws_config::defaults(BehaviorVersion::latest()).load().await;
+ if let Some(region) = config.region() {
+ builder = builder.with_region(region.to_string());
+ }
+
+ let credentials = config
+ .credentials_provider()
+ .ok_or_else(|| {
+ DataFusionError::ObjectStore(Box::new(object_store::Error::Generic {
+ store: "S3",
+ source: "Failed to get S3 credentials from the environment".into(),
+ }))
+ })?
+ .clone();
+
+ let credentials = Arc::new(S3CredentialProvider { credentials });
+ builder = builder.with_credentials(credentials);
+ }
}
if let Some(region) = region {
@@ -293,6 +337,8 @@ pub struct AwsOptions {
pub endpoint: Option<String>,
/// Allow HTTP (otherwise will always use https)
pub allow_http: Option<bool>,
+ /// Skip request signing for anonymous access
+ pub skip_signature: Option<bool>,
}
impl ExtensionOptions for AwsOptions {
@@ -330,6 +376,9 @@ impl ExtensionOptions for AwsOptions {
"allow_http" => {
self.allow_http.set(rem, value)?;
}
+ "skip_signature" | "SKIP_SIGNATURE" => {
+ self.skip_signature.set(rem, value)?;
+ }
_ => {
return config_err!("Config value \"{}\" not found on AwsOptions", rem);
}
@@ -366,6 +415,7 @@ impl ExtensionOptions for AwsOptions {
self.region.visit(&mut v, "region", "");
self.endpoint.visit(&mut v, "endpoint", "");
self.allow_http.visit(&mut v, "allow_http", "");
+ self.skip_signature.visit(&mut v, "skip_signature", "");
v.0
}
}