This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 1112f7a bug: Support read_parquet glob file paths (#34)
1112f7a is described below
commit 1112f7a7fe2145b93418329fe81fa245d19e11ef
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Sep 5 22:09:15 2025 -0700
bug: Support read_parquet glob file paths (#34)
* Add 'test_read_parquet_local_glob' python test
* Fix glob behavior by only converting to globs once
* Clean up
---
python/sedonadb/tests/test_context.py | 14 ++++++++++++++
rust/sedona-geoparquet/src/provider.rs | 24 ++++++++++++++----------
2 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/python/sedonadb/tests/test_context.py b/python/sedonadb/tests/test_context.py
index fe1300c..ce8b102 100644
--- a/python/sedonadb/tests/test_context.py
+++ b/python/sedonadb/tests/test_context.py
@@ -35,6 +35,20 @@ def test_read_parquet(con, geoarrow_data):
assert len(tab) == 244
+def test_read_parquet_local_glob(con, geoarrow_data):
+ # The above test uses .glob() method, this test uses the raw string
+ tab = con.read_parquet(
+ geoarrow_data / "example/files/*_geo.parquet"
+ ).to_arrow_table()
+ assert tab["geometry"].type.extension_name == "geoarrow.wkb"
+ assert len(tab) == 244
+
+ tab = con.read_parquet(
+ geoarrow_data / "example/files/example_polygon-*geo.parquet"
+ ).to_arrow_table()
+ assert len(tab) == 12
+
+
def test_read_parquet_error(con):
with pytest.raises(sedonadb._lib.SedonaError, match="No table paths were provided"):
con.read_parquet([])
diff --git a/rust/sedona-geoparquet/src/provider.rs b/rust/sedona-geoparquet/src/provider.rs
index 485e57e..6fcb4df 100644
--- a/rust/sedona-geoparquet/src/provider.rs
+++ b/rust/sedona-geoparquet/src/provider.rs
@@ -24,7 +24,7 @@ use datafusion::{
file_format::parquet::ParquetFormat,
listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
},
- execution::{context::DataFilePaths, options::ReadOptions, SessionState},
+ execution::{options::ReadOptions, SessionState},
prelude::{ParquetReadOptions, SessionConfig, SessionContext},
};
use datafusion_common::{exec_err, Result};
@@ -36,12 +36,11 @@ use crate::format::GeoParquetFormat;
/// Because [ListingTable] implements `TableProvider`, this can be used to
/// implement geo-aware Parquet reading with interfaces that are otherwise
/// hard-coded to the built-in Parquet reader.
-pub async fn geoparquet_listing_table<P: DataFilePaths>(
+pub async fn geoparquet_listing_table(
context: &SessionContext,
- table_paths: P,
+ table_paths: Vec<ListingTableUrl>,
options: GeoParquetReadOptions<'_>,
) -> Result<ListingTable> {
- let table_paths = table_paths.to_urls()?;
let session_config = context.copied_config();
let listing_options =
options.to_listing_options(&session_config, context.copied_table_options());
@@ -134,7 +133,9 @@ mod test {
let data_dir = geoarrow_data_dir().unwrap();
let tab = geoparquet_listing_table(
&ctx,
- format!("{data_dir}/example/files/*_geo.parquet"),
+ vec![
+ ListingTableUrl::parse(format!("{data_dir}/example/files/*_geo.parquet")).unwrap(),
+ ],
GeoParquetReadOptions::default(),
)
.await
@@ -169,15 +170,18 @@ mod test {
#[tokio::test]
async fn listing_table_errors() {
let ctx = SessionContext::new();
- let err =
- geoparquet_listing_table(&ctx, Vec::<String>::new(), GeoParquetReadOptions::default())
- .await
- .unwrap_err();
+ let err = geoparquet_listing_table(
+ &ctx,
+ Vec::<ListingTableUrl>::new(),
+ GeoParquetReadOptions::default(),
+ )
+ .await
+ .unwrap_err();
assert_eq!(err.message(), "No table paths were provided");
let err = geoparquet_listing_table(
&ctx,
- "foofy.wrongextension",
+ vec![ListingTableUrl::parse("foofy.wrongextension").unwrap()],
GeoParquetReadOptions::default(),
)
.await