This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit 2a107c32a1aa4b2043f76d30e073bbd8ca3fb304 Author: Pranav Toggi <[email protected]> AuthorDate: Fri Aug 29 00:57:22 2025 -0700 [EWT-3249] Make Zone table cardinality scale in tiered fashion (#12) * Update overture version to 2025-08-20.1 * make Zone table scale in tiered Scale factors * fix fmt * Fix PR comments --------- Co-authored-by: Jia Yu <[email protected]> --- README.md | 31 ++++++--- spatialbench-cli/src/plan.rs | 9 ++- spatialbench/src/generators.rs | 139 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 153 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 1ff86a6..85e1a00 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,27 @@ Goals: SpatialBench defines a spatial star schema with the following tables: -| Table | Type | Abbr. | Description | Spatial Attributes | Cardinality per SF | -|------------|--------------|-------|---------------------------------------------|----------------------------|----------------------| -| Trip | Fact Table | `t_` | Individual trip records | pickup & dropoff points | 6M × SF | -| Customer | Dimension | `c_` | Trip customer info | None | 30K × SF | -| Driver | Dimension | `s_` | Trip driver info | None | 500 × SF | -| Vehicle | Dimension | `v_` | Trip vehicle info | None | 100 × SF | -| Zone | Dimension | `z_` | Administrative zones | Polygon | ~867k (fixed) | -| Building | Dimension | `b_` | Building footprints | Polygon | 20K × (1 + log₂(SF)) | - -Unlike other tables in the benchmark, the Zone table does not scale with the scale factor. It is a fixed-size reference table representing administrative boundaries and is derived from the Overture Maps Divisions theme, release version 2025-06-25.0. This ensures consistency and realism for spatial join workloads such as point-in-polygon or zone-based aggregations. +| Table | Type | Abbr. | Description | Spatial Attributes | Cardinality per SF | +|------------|--------------|-------|---------------------------------------------|----------------------------|--------------------------------| +| Trip | Fact Table | `t_` | Individual trip records | pickup & dropoff points | 6M × SF | +| Customer | Dimension | `c_` | Trip customer info | None | 30K × SF | +| Driver | Dimension | `s_` | Trip driver info | None | 500 × SF | +| Vehicle | Dimension | `v_` | Trip vehicle info | None | 100 × SF | +| Zone | Dimension | `z_` | Administrative zones (SF-aware scaling) | Polygon | Tiered by SF range (see below) | +| Building | Dimension | `b_` | Building footprints | Polygon | 20K × (1 + log₂(SF)) | + +### Zone Table Scaling + +The Zone table uses **scale factor–aware generation** so that zone granularity scales with dataset size and keeps query cost realistic. At small scales, this feels like querying ZIP-level units; at large scales, it uses coarser administrative units. + +| Scale Factor (SF) | Zone Subtypes Included | Zone Cardinality | +|-------------------|----------------------------------------------------|------------------| +| [0, 10) | microhood, macrohood | 117,416 | +| [10, 100) | + neighborhood, county | 455,711 | +| [100, 1000) | + localadmin, locality, region, dependency | 1,035,371 | +| [1000+) | + country | 1,035,749 | + +This tiered scaling reflects **geometry complexity** and **area distributions** observed in the Overture `division_area` dataset which represents administrative boundaries, release version 2025-08-20.1.  diff --git a/spatialbench-cli/src/plan.rs b/spatialbench-cli/src/plan.rs index e28b1f0..89df572 100644 --- a/spatialbench-cli/src/plan.rs +++ b/spatialbench-cli/src/plan.rs @@ -230,7 +230,14 @@ impl OutputSize { Table::Customer => 87, Table::Trip => 69, Table::Building => 109, - Table::Zone => 4258, + Table::Zone => { + // Scale based on zone subtype count for the scale factor + match scale_factor { + sf if sf < 10.0 => 1332, + sf if sf < 100.0 => 2000, + _ => 4258, + } + } }, }; diff --git a/spatialbench/src/generators.rs b/spatialbench/src/generators.rs index a676fd1..02e635d 100644 --- a/spatialbench/src/generators.rs +++ b/spatialbench/src/generators.rs @@ -1433,9 +1433,8 @@ pub struct ZoneGenerator { } impl ZoneGenerator { - const SCALE_BASE: i32 = 867_102; /// S3 URL for the zones parquet file - const OVERTURE_RELEASE_DATE: &'static str = "2025-06-25.0"; + const OVERTURE_RELEASE_DATE: &'static str = "2025-08-20.1"; const OVERTURE_S3_BUCKET: &'static str = "overturemaps-us-west-2"; const OVERTURE_S3_PREFIX: &'static str = "release"; @@ -1449,6 +1448,54 @@ impl ZoneGenerator { ) } + /// Get zone subtypes based on scale factor + fn get_zone_subtypes_for_scale_factor(scale_factor: f64) -> Vec<&'static str> { + let mut subtypes = vec!["microhood", "macrohood"]; + + if scale_factor >= 10.0 { + subtypes.extend_from_slice(&["neighborhood", "county"]); + } + + if scale_factor >= 100.0 { + subtypes.extend_from_slice(&["localadmin", "locality", "region", "dependency"]); + } + + if scale_factor >= 1000.0 { + subtypes.push("country"); + } + + subtypes + } + + /// Calculate total zones for a given scale factor based on subtype counts + fn calculate_total_zones_for_scale_factor(scale_factor: f64) -> i64 { + let subtypes = Self::get_zone_subtypes_for_scale_factor(scale_factor); + let mut total = 0i64; + + for subtype in subtypes { + let count = match subtype { + "microhood" => 74797, + "macrohood" => 42619, + "neighborhood" => 298615, + "county" => 39680, + "localadmin" => 19007, + "locality" => 555834, + "region" => 4714, + "dependency" => 105, + "country" => 378, + _ => 0, + }; + total += count; + } + + // Scale down for testing purposes + if scale_factor < 1.0 { + total = (total as f64 * scale_factor).ceil() as i64; + } + + total + } + /// Create a new zone generator with streaming approach pub fn new(scale_factor: f64, part: i32, part_count: i32) -> Self { let start = Instant::now(); @@ -1468,7 +1515,7 @@ impl ZoneGenerator { /// Calculate zones per partition fn calculate_zones_per_part(&self) -> i64 { - let total_zones = (self.scale_factor * Self::SCALE_BASE as f64).ceil() as i64; + let total_zones = Self::calculate_total_zones_for_scale_factor(self.scale_factor); (total_zones as f64 / self.part_count as f64).ceil() as i64 } @@ -1506,12 +1553,31 @@ impl ZoneGenerator { let zones_per_part = self.calculate_zones_per_part(); let offset = self.calculate_offset(); let zones_url = Self::get_zones_parquet_url(); + let subtypes = Self::get_zone_subtypes_for_scale_factor(self.scale_factor); info!( - "Partition {}: LIMIT {} OFFSET {} from {}", - self.part, zones_per_part, offset, zones_url + "Partition {}: LIMIT {} OFFSET {} from {} with subtypes: {:?}", + self.part, zones_per_part, offset, zones_url, subtypes ); + // Build the subtype filter + let subtype_filter = if subtypes.is_empty() { + return Err(format!( + "No subtypes found for scale factor {} in partition {}. This indicates a logic error.", + self.scale_factor, + self.part + ).into()); + } else { + format!( + "subtype IN ({})", + subtypes + .iter() + .map(|s| format!("'{}'", s)) + .collect::<Vec<_>>() + .join(", ") + ) + }; + let query = format!( "SELECT id as z_gersid, @@ -1521,9 +1587,9 @@ impl ZoneGenerator { subtype as z_subtype, ST_AsWKB(geometry) as z_boundary FROM read_parquet('{}', hive_partitioning=1) - WHERE subtype IN ('localadmin', 'locality', 'neighborhood') + WHERE {} LIMIT {} OFFSET {};", - zones_url, zones_per_part, offset + zones_url, subtype_filter, zones_per_part, offset ); debug!("Generated partition query: {}", query); @@ -1561,11 +1627,7 @@ impl ZoneGenerator { }); if zones.len() % 1000 == 0 { - debug!( - "Processed {} rows for partition {}...", - zones.len(), - self.part - ); + debug!("Loaded {} zones for partition {}", zones.len(), self.part); } zone_id += 1; } @@ -1583,7 +1645,7 @@ impl ZoneGenerator { /// Return the row count for the given part pub fn calculate_row_count(&self) -> i64 { - let total_zones = (self.scale_factor * Self::SCALE_BASE as f64).ceil() as i64; + let total_zones = Self::calculate_total_zones_for_scale_factor(self.scale_factor); let zones_per_part = self.calculate_zones_per_part(); let offset = self.calculate_offset(); @@ -1787,14 +1849,61 @@ mod tests { let generator = ZoneGenerator::new(0.001, 1, 1); let zones: Vec<_> = generator.into_iter().collect(); - assert_eq!(zones.len(), 868); + assert_eq!(zones.len(), 118); // Check first Driver let first = &zones[0]; assert_eq!(first.z_zonekey, 1); assert_eq!( first.to_string(), - "1|b40981d8-1a8b-4b30-bbdc-2a2d941bfa4f|PF||Anapoto|locality|POLYGON((-152.8059003 -22.6387783,-152.8063121 -22.6353325,-152.8063274 -22.6352309,-152.8064935 -22.6352445,-152.806615 -22.6352496,-152.8068727 -22.6352603,-152.8070173 -22.6352663,-152.8072428 -22.6352461,-152.8073888 -22.6352422,-152.8075809 -22.6352564,-152.8076508 -22.6352615,-152.8080525 -22.6353115,-152.8082102 -22.6353388,-152.8083864 -22.6353691,-152.8087408 -22.635439,-152.8089964 -22.6354851,-152.809157 [...] + "1|635d3a50-3055-44a6-8968-7e7d65dd3f61|WF|WF-UV|Place Sagato-Soane|microhood|POLYGON((-176.1735809 -13.28369,-176.1737479 -13.283821,-176.1738536 -13.2838989,-176.173536 -13.2842404,-176.1725987 -13.2833717,-176.1725033 -13.2833872,-176.1724121 -13.2833876,-176.1723319 -13.283372,-176.1722686 -13.2833485,-176.1720379 -13.283278,-176.172337 -13.2830551,-176.17235 -13.2830455,-176.1724748 -13.283002,-176.1725888 -13.2829915,-176.1727488 -13.2830245,-176.1728399 -13.2830431,-17 [...] ) } + + #[test] + fn test_zone_subtype_filters() { + // Test scale factor 0-10: should only include microhood and macrohood + let subtypes_0_10 = ZoneGenerator::get_zone_subtypes_for_scale_factor(5.0); + assert_eq!(subtypes_0_10, vec!["microhood", "macrohood"]); + + // Test scale factor 10-100: should include microhood, macrohood, neighborhood, county + let subtypes_10_100 = ZoneGenerator::get_zone_subtypes_for_scale_factor(50.0); + assert_eq!( + subtypes_10_100, + vec!["microhood", "macrohood", "neighborhood", "county"] + ); + + // Test scale factor 100-1000: should include all except country + let subtypes_100_1000 = ZoneGenerator::get_zone_subtypes_for_scale_factor(500.0); + assert_eq!( + subtypes_100_1000, + vec![ + "microhood", + "macrohood", + "neighborhood", + "county", + "localadmin", + "locality", + "region", + "dependency" + ] + ); + + // Test scale factor 1000+: should include all subtypes + let subtypes_1000_plus = ZoneGenerator::get_zone_subtypes_for_scale_factor(2000.0); + assert_eq!( + subtypes_1000_plus, + vec![ + "microhood", + "macrohood", + "neighborhood", + "county", + "localadmin", + "locality", + "region", + "dependency", + "country" + ] + ); + } }
