Copilot commented on code in PR #54:
URL: https://github.com/apache/sedona-spatialbench/pull/54#discussion_r2445839378


##########
spatialbench-cli/src/zone_df.rs:
##########
@@ -200,29 +192,39 @@ pub async fn generate_zone_parquet(args: ZoneDfArgs) -> Result<()> {
     let rt: Arc<RuntimeEnv> = Arc::new(RuntimeEnvBuilder::new().build()?);
     debug!("Built DataFusion runtime environment");
 
-    // Register S3 store for Overture bucket
-    let bucket = OVERTURE_S3_BUCKET;
-    info!("Registering S3 store for bucket: {}", bucket);
-    let s3 = AmazonS3Builder::new()
-        .with_bucket_name(bucket)
-        .with_skip_signature(true)
-        .with_region("us-west-2")
-        .build()?;
-
-    let s3_url = Url::parse(&format!("s3://{bucket}"))?;
-    let s3_store: Arc<dyn ObjectStore> = Arc::new(s3);
-    rt.register_object_store(&s3_url, s3_store);
-    debug!("Successfully registered S3 object store");
+    // Register HTTPS object store for Hugging Face
+    let hf_store = HttpBuilder::new().with_url(HUGGINGFACE_URL).build()?;
+    let hf_url = Url::parse(HUGGINGFACE_URL)?;
+    rt.register_object_store(&hf_url, Arc::new(hf_store));
+    debug!("Registered HTTPS object store for huggingface.co");
 
     let ctx = SessionContext::new_with_config_rt(SessionConfig::from(cfg), rt);
     debug!("Created DataFusion session context");
 
-    let url = zones_parquet_url();
-    info!("Reading parquet data from: {}", url);
+    // Parquet parts from Hugging Face (programmatically generated)
+    const PARQUET_PART_COUNT: usize = 4;
+    const PARQUET_UUID: &str = "c998b093-fa14-440c-98f0-bbdb2126ed22";
+    let parquet_urls: Vec<String> = (0..PARQUET_PART_COUNT)
+        .map(|i| format!(
+            "https://huggingface.co/datasets/apache-sedona/spatialbench/resolve/{}/omf-division-area-{}/part-{i:05}-{uuid}-c000.zstd.parquet",

Review Comment:
   The format string contains redundant named parameters: it uses `{i:05}` and `{uuid}` but then explicitly names them in the arguments as `i = i` and `uuid = PARQUET_UUID`. Either use positional arguments (`{0:05}` and `{1}`), or rely on implicit capture and drop the explicit arguments. Note that `i = i` is the truly redundant part; implicit capture of `{uuid}` only works if a local binding named `uuid` is in scope, since the constant is `PARQUET_UUID`.
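
   A minimal sketch of both options (illustrative only: the URL path is shortened and the helper names are invented for the example):
   ```rust
   const PARQUET_UUID: &str = "c998b093-fa14-440c-98f0-bbdb2126ed22";

   // Option 1: positional arguments; {0:05} zero-pads the index to five digits.
   fn part_name_positional(i: usize) -> String {
       format!("part-{0:05}-{1}-c000.zstd.parquet", i, PARQUET_UUID)
   }

   // Option 2: implicit capture (Rust 1.58+); requires in-scope bindings
   // named exactly `i` and `uuid`.
   fn part_name_captured(i: usize) -> String {
       let uuid = PARQUET_UUID;
       format!("part-{i:05}-{uuid}-c000.zstd.parquet")
   }
   ```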



##########
spatialbench-cli/src/zone_df.rs:
##########
@@ -11,24 +11,16 @@ use datafusion::{
 use crate::plan::DEFAULT_PARQUET_ROW_GROUP_BYTES;
 use datafusion::execution::runtime_env::RuntimeEnv;
 use log::{debug, info};
-use object_store::aws::AmazonS3Builder;
-use object_store::ObjectStore;
+use object_store::http::HttpBuilder;
 use parquet::{
     arrow::ArrowWriter, basic::Compression as ParquetCompression,
     file::properties::WriterProperties,
 };
 use url::Url;
 
 const OVERTURE_RELEASE_DATE: &str = "2025-08-20.1";
-const OVERTURE_S3_BUCKET: &str = "overturemaps-us-west-2";
-const OVERTURE_S3_PREFIX: &str = "release";
-
-fn zones_parquet_url() -> String {
-    format!(
-        "s3://{}/{}/{}/theme=divisions/type=division_area/",
-        OVERTURE_S3_BUCKET, OVERTURE_S3_PREFIX, OVERTURE_RELEASE_DATE
-    )
-}
+const HUGGINGFACE_URL: &str = "https://huggingface.co";

Review Comment:
   The hardcoded commit hash lacks context. Add a comment explaining that this is the Hugging Face dataset commit hash and what happens if the commit is updated (e.g., does PARQUET_PART_COUNT or PARQUET_UUID need to change?).
   ```suggestion
   const HUGGINGFACE_URL: &str = "https://huggingface.co";
   // This is the Hugging Face dataset commit hash for the Overture release.
   // If the dataset is updated and the commit hash changes, you may also need to
   // update related constants such as PARQUET_PART_COUNT or PARQUET_UUID to
   // match the new dataset structure.
   ```
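
   The commit hash constant itself is not visible in the quoted hunk, so as a sketch only (the `HUGGINGFACE_COMMIT` name and placeholder value below are hypothetical), the documented block could read:
   ```rust
   const HUGGINGFACE_URL: &str = "https://huggingface.co";

   // Hypothetical constant: pins the apache-sedona/spatialbench dataset to a
   // specific revision on Hugging Face. If this revision is bumped, re-check
   // PARQUET_PART_COUNT and PARQUET_UUID against the new file listing, since
   // both depend on how the dataset was sharded at upload time.
   const HUGGINGFACE_COMMIT: &str = "<dataset commit hash>";
   ```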



##########
spatialbench-cli/src/zone_df.rs:
##########
@@ -200,29 +192,39 @@ pub async fn generate_zone_parquet(args: ZoneDfArgs) -> Result<()> {
     let rt: Arc<RuntimeEnv> = Arc::new(RuntimeEnvBuilder::new().build()?);
     debug!("Built DataFusion runtime environment");
 
-    // Register S3 store for Overture bucket
-    let bucket = OVERTURE_S3_BUCKET;
-    info!("Registering S3 store for bucket: {}", bucket);
-    let s3 = AmazonS3Builder::new()
-        .with_bucket_name(bucket)
-        .with_skip_signature(true)
-        .with_region("us-west-2")
-        .build()?;
-
-    let s3_url = Url::parse(&format!("s3://{bucket}"))?;
-    let s3_store: Arc<dyn ObjectStore> = Arc::new(s3);
-    rt.register_object_store(&s3_url, s3_store);
-    debug!("Successfully registered S3 object store");
+    // Register HTTPS object store for Hugging Face
+    let hf_store = HttpBuilder::new().with_url(HUGGINGFACE_URL).build()?;
+    let hf_url = Url::parse(HUGGINGFACE_URL)?;
+    rt.register_object_store(&hf_url, Arc::new(hf_store));
+    debug!("Registered HTTPS object store for huggingface.co");
 
     let ctx = SessionContext::new_with_config_rt(SessionConfig::from(cfg), rt);
     debug!("Created DataFusion session context");
 
-    let url = zones_parquet_url();
-    info!("Reading parquet data from: {}", url);
+    // Parquet parts from Hugging Face (programmatically generated)
+    const PARQUET_PART_COUNT: usize = 4;

Review Comment:
   These magic constants lack explanation. Add comments explaining what PARQUET_PART_COUNT represents (the number of sharded parquet files?) and the significance of PARQUET_UUID (is it a file identifier from the Hugging Face dataset?).
   ```suggestion
       // Parquet parts from Hugging Face (programmatically generated)
       // Number of sharded Parquet files (parts) to read from Hugging Face for the dataset.
       const PARQUET_PART_COUNT: usize = 4;
       // Unique identifier for the Parquet file set, corresponding to a specific version/commit in the Hugging Face dataset.
   ```
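
   As a sketch of the requested comments (the wording is illustrative, and the claim that the UUID follows the usual Spark-style part-file naming convention is an assumption):
   ```rust
   // Number of sharded Parquet part files the dataset was split into on
   // Hugging Face (part-00000 through part-00003, given the {i:05} padding).
   const PARQUET_PART_COUNT: usize = 4;

   // UUID embedded in every part file name (part-XXXXX-<uuid>-c000.zstd.parquet);
   // likely the writer job's identifier, so it changes whenever the dataset is
   // regenerated and must be kept in sync with the pinned revision.
   const PARQUET_UUID: &str = "c998b093-fa14-440c-98f0-bbdb2126ed22";
   ```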



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
