This is an automated email from the ASF dual-hosted git repository.

prantogg pushed a commit to branch use-huggingface-source
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git

commit a1b9fa76281778da5246477ad3579d88eb1ba113
Author: Pranav Toggi <[email protected]>
AuthorDate: Fri Oct 17 15:15:05 2025 -0700

    load from HF over https
---
 spatialbench-cli/Cargo.toml     |  2 +-
 spatialbench-cli/src/zone_df.rs | 52 +++++++++++++++++++----------------------
 2 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/spatialbench-cli/Cargo.toml b/spatialbench-cli/Cargo.toml
index 1180f39..3f2b809 100644
--- a/spatialbench-cli/Cargo.toml
+++ b/spatialbench-cli/Cargo.toml
@@ -24,7 +24,7 @@ serde = { version = "1.0.219", features = ["derive"] }
 anyhow = "1.0.99"
 serde_yaml = "0.9.33"
 datafusion = "47.0.0"
-object_store = { version = "0.12.4", features = ["aws"] }
+object_store = { version = "0.12.4", features = ["http"] }
 arrow-array = "55.2.0"
 arrow-schema = "55.2.0"
 url = "2.5.7"
diff --git a/spatialbench-cli/src/zone_df.rs b/spatialbench-cli/src/zone_df.rs
index 464b61d..b486cb0 100644
--- a/spatialbench-cli/src/zone_df.rs
+++ b/spatialbench-cli/src/zone_df.rs
@@ -11,8 +11,7 @@ use datafusion::{
 use crate::plan::DEFAULT_PARQUET_ROW_GROUP_BYTES;
 use datafusion::execution::runtime_env::RuntimeEnv;
 use log::{debug, info};
-use object_store::aws::AmazonS3Builder;
-use object_store::ObjectStore;
+use object_store::http::HttpBuilder;
 use parquet::{
     arrow::ArrowWriter, basic::Compression as ParquetCompression,
     file::properties::WriterProperties,
@@ -20,15 +19,7 @@ use parquet::{
 use url::Url;
 
 const OVERTURE_RELEASE_DATE: &str = "2025-08-20.1";
-const OVERTURE_S3_BUCKET: &str = "overturemaps-us-west-2";
-const OVERTURE_S3_PREFIX: &str = "release";
-
-fn zones_parquet_url() -> String {
-    format!(
-        "s3://{}/{}/{}/theme=divisions/type=division_area/",
-        OVERTURE_S3_BUCKET, OVERTURE_S3_PREFIX, OVERTURE_RELEASE_DATE
-    )
-}
+const HUGGINGFACE_URL: &str = "https://huggingface.co";
 
 fn subtypes_for_scale_factor(sf: f64) -> Vec<&'static str> {
     let mut v = vec!["microhood", "macrohood", "county"];
@@ -200,29 +191,34 @@ pub async fn generate_zone_parquet(args: ZoneDfArgs) -> Result<()> {
     let rt: Arc<RuntimeEnv> = Arc::new(RuntimeEnvBuilder::new().build()?);
     debug!("Built DataFusion runtime environment");
 
-    // Register S3 store for Overture bucket
-    let bucket = OVERTURE_S3_BUCKET;
-    info!("Registering S3 store for bucket: {}", bucket);
-    let s3 = AmazonS3Builder::new()
-        .with_bucket_name(bucket)
-        .with_skip_signature(true)
-        .with_region("us-west-2")
-        .build()?;
-
-    let s3_url = Url::parse(&format!("s3://{bucket}"))?;
-    let s3_store: Arc<dyn ObjectStore> = Arc::new(s3);
-    rt.register_object_store(&s3_url, s3_store);
-    debug!("Successfully registered S3 object store");
+    // Register HTTPS object store for Hugging Face
+    let hf_store = HttpBuilder::new().with_url(HUGGINGFACE_URL).build()?;
+    let hf_url = Url::parse(HUGGINGFACE_URL)?;
+    rt.register_object_store(&hf_url, Arc::new(hf_store));
+    debug!("Registered HTTPS object store for huggingface.co");
 
     let ctx = SessionContext::new_with_config_rt(SessionConfig::from(cfg), rt);
     debug!("Created DataFusion session context");
 
-    let url = zones_parquet_url();
-    info!("Reading parquet data from: {}", url);
+    // 4 Parquet parts from Hugging Face
+    let parquet_urls = vec![
+        format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00000-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE),
+        format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00001-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE),
+        format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00002-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE),
+        format!("https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/main/omf-division-area-{}/part-00003-c998b093-fa14-440c-98f0-bbdb2126ed22-c000.zstd.parquet", OVERTURE_RELEASE_DATE),
+    ];
+
+    info!(
+        "Reading {} Parquet parts from Hugging Face…",
+        parquet_urls.len()
+    );
+
     let t_read_start = Instant::now();
-    let mut df = ctx.read_parquet(url, ParquetReadOptions::default()).await?;
+    let mut df = ctx
+        .read_parquet(parquet_urls, ParquetReadOptions::default())
+        .await?;
     let read_dur = t_read_start.elapsed();
-    info!("Successfully read parquet data in {:?}", read_dur);
+    info!("Successfully read HF parquet data in {:?}", read_dur);
 
     // Build filter predicate
     debug!("Building filter predicate for subtypes: {:?}", subtypes);

Reply via email to