This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit ccb03489c2bb620ff3715610da95612cdcc97386 Author: Pranav Toggi <[email protected]> AuthorDate: Thu Jun 26 14:14:11 2025 -0700 add parcel distribution --- tpchgen-arrow/src/building.rs | 2 +- tpchgen/src/generators.rs | 26 ++++- tpchgen/src/lib.rs | 3 +- tpchgen/src/spider.rs | 259 +++++++++++++++++++++++++++++++++--------- tpchgen/src/spider_presets.rs | 53 +++++++++ 5 files changed, 281 insertions(+), 62 deletions(-) diff --git a/tpchgen-arrow/src/building.rs b/tpchgen-arrow/src/building.rs index 34a6ccb..05821c6 100644 --- a/tpchgen-arrow/src/building.rs +++ b/tpchgen-arrow/src/building.rs @@ -59,7 +59,7 @@ impl Iterator for BuildingArrow { let buildingkey = Int64Array::from_iter_values(rows.iter().map(|r| r.b_buildingkey)); let name = string_view_array_from_display_iter(rows.iter().map(|r| &r.b_name)); - let polygon_wkt = StringViewArray::from_iter_values(rows.iter().map(|r| r.b_polygonwkt)); + let polygon_wkt = StringViewArray::from_iter_values(rows.iter().map(|r| r.b_polygonwkt.clone())); let batch = RecordBatch::try_new( Arc::clone(self.schema()), diff --git a/tpchgen/src/generators.rs b/tpchgen/src/generators.rs index 7d27dd8..a87c39e 100644 --- a/tpchgen/src/generators.rs +++ b/tpchgen/src/generators.rs @@ -14,7 +14,8 @@ use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use crate::dates::{GenerateUtils, TPCHDate}; use crate::random::{RandomBoundedInt, RandomString, RandomStringSequence, RandomText}; -use crate::spider::SpiderGenerator; +use crate::spider::{spider_seed_for_index, SpiderGenerator}; +use crate::spider_presets::SpiderPresets; /// Generator for Nation table data #[derive(Debug, Clone)] @@ -2042,7 +2043,7 @@ pub struct TripGenerator { impl TripGenerator { /// Base scale for trip generation - const SCALE_BASE: i32 = 1_500_000; + const SCALE_BASE: i32 = 500_000; // Constants for trip generation const FARE_MIN_PER_MILE: i32 = 150; // $1.50 per mile @@ -2061,7 +2062,7 @@ impl TripGenerator { Distributions::static_default(), TextPool::get_or_init_default(), crate::kde::default_distance_kde(), - SpiderGenerator::default(), + SpiderPresets::for_trip_pickups(), ) } @@ -2327,7 +2328,7 @@ pub struct Building<'a> { /// Name of the building pub b_name: StringSequenceInstance<'a>, /// WKT representation of the building's polygon - pub b_polygonwkt: &'a str, + pub b_polygonwkt: String, } impl Display for Building<'_> { @@ -2349,6 +2350,7 @@ pub struct BuildingGenerator<'a> { part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, + spatial_gen: SpiderGenerator, } impl<'a> BuildingGenerator<'a> { @@ -2369,6 +2371,7 @@ impl<'a> BuildingGenerator<'a> { part_count, Distributions::static_default(), TextPool::get_or_init_default(), + SpiderPresets::for_building_polygons(), ) } @@ -2379,6 +2382,7 @@ impl<'a> BuildingGenerator<'a> { part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, + spatial_gen: SpiderGenerator, ) -> BuildingGenerator<'b> { BuildingGenerator { scale_factor, @@ -2386,6 +2390,7 @@ impl<'a> BuildingGenerator<'a> { part_count, distributions, text_pool, + spatial_gen, } } @@ -2406,6 +2411,7 @@ impl<'a> BuildingGenerator<'a> { self.part_count, ), Self::calculate_row_count(self.scale_factor, self.part, self.part_count), + self.spatial_gen.clone(), ) } } @@ -2424,6 +2430,7 @@ impl<'a> IntoIterator for &'a BuildingGenerator<'a> { pub struct BuildingGeneratorIterator<'a> { name_random: RandomStringSequence<'a>, wkt_random: RandomText<'a>, + spatial_gen: SpiderGenerator, start_index: i64, row_count: i64, @@ -2436,6 +2443,7 @@ impl<'a> BuildingGeneratorIterator<'a> { text_pool: &'a TextPool, start_index: i64, row_count: i64, + spatial_gen: SpiderGenerator, ) -> Self { let mut name_random = RandomStringSequence::new( 709314158, @@ -2457,6 +2465,8 @@ impl<'a> BuildingGeneratorIterator<'a> { wkt_random, start_index, row_count, + spatial_gen, + index: 0, } } @@ -2465,10 +2475,14 @@ impl<'a> BuildingGeneratorIterator<'a> { fn make_building(&mut self, building_key: i64) -> Building<'a> { let name = self.name_random.next_value(); + let seed = spider_seed_for_index(building_key as u64, 1234); + let mut rng = StdRng::seed_from_u64(seed); + let wkt = self.spatial_gen.generate_parcel(&mut rng); + Building { b_buildingkey: building_key, b_name: name, - b_polygonwkt: self.wkt_random.next_value(), + b_polygonwkt: wkt, } } } @@ -2623,7 +2637,7 @@ mod tests { let trips: Vec<_> = generator.iter().collect(); // Should have 0.01 * 1,000,000 = 10,000 trips - assert_eq!(trips.len(), 15000); + assert_eq!(trips.len(), 5000); // Check first trip let first = &trips[0]; diff --git a/tpchgen/src/lib.rs b/tpchgen/src/lib.rs index 4fee43c..098941c 100644 --- a/tpchgen/src/lib.rs +++ b/tpchgen/src/lib.rs @@ -60,4 +60,5 @@ pub mod queries; pub mod random; pub mod text; pub mod kde; -mod spider; +pub mod spider; +pub mod spider_presets; diff --git a/tpchgen/src/spider.rs b/tpchgen/src/spider.rs index d1bf86b..7d1042b 100644 --- a/tpchgen/src/spider.rs +++ b/tpchgen/src/spider.rs @@ -2,19 +2,58 @@ use rand::{Rng, SeedableRng}; use rand::rngs::StdRng; #[derive(Debug, Clone, Copy)] -pub enum SpiderDistribution { +pub enum DistributionType { Uniform, + Normal, + Diagonal, + Sierpinski, + Bit, + Parcel, +} + +#[derive(Debug, Clone, Copy)] +pub enum GeomType { + Polygon, + Box, + Point, +} + +#[derive(Debug, Clone)] +pub struct BoxWithDepth { + pub depth: i32, + pub x: f64, + pub y: f64, + pub w: f64, + pub h: f64, +} + +#[derive(Debug, Clone)] +pub enum DistributionParams { + None, Normal { mu: f64, sigma: f64 }, Diagonal { percentage: f64, buffer: f64 }, Bit { probability: f64, digits: u32 }, - Sierpinski, + Parcel { srange: f64, dither: f64 }, } #[derive(Debug, Clone)] pub struct SpiderConfig { - pub dist: SpiderDistribution, - pub global_seed: u64, - pub affine: Option<[f64; 6]>, + pub dist_type: DistributionType, + pub geom_type: GeomType, + pub dim: i32, + pub seed: u32, + pub affine: Option<[f64; 6]>, // Affine transformation matrix + + // Box-specific fields + pub width: f64, + pub height: f64, + + // Polygon-specific fields + pub maxseg: i32, + pub polysize: f64, + + // Distribution-specific params + pub params: DistributionParams, } #[derive(Clone, Debug)] @@ -28,54 +67,164 @@ impl SpiderGenerator { } pub fn generate_point(&self, index: u64) -> (f64, f64) { - let seed = spider_seed_for_index(index, self.config.global_seed); + let seed = spider_seed_for_index(index, self.config.seed as u64); let mut rng = StdRng::seed_from_u64(seed); - match self.config.dist { - SpiderDistribution::Uniform => (rng.gen(), rng.gen()), + match self.config.dist_type { + DistributionType::Uniform => self.generate_uniform(&mut rng), + DistributionType::Normal => self.generate_normal(&mut rng), + DistributionType::Diagonal => self.generate_diagonal(&mut rng), + DistributionType::Bit => self.generate_bit(&mut rng), + DistributionType::Sierpinski => self.generate_sierpinski(&mut rng), + _ => (rng.gen(), rng.gen()) + } + + } + + fn generate_uniform(&self, rng: &mut StdRng) -> (f64, f64) { + (rand_unit(rng), rand_unit(rng)) + } + + fn generate_normal(&self, rng: &mut StdRng) -> (f64, f64) { + if let DistributionParams::Normal { mu, sigma } = self.config.params { + let x = rand_normal(rng, mu, sigma).clamp(0.0, 1.0); + let y = rand_normal(rng, mu, sigma).clamp(0.0, 1.0); + (x, y) + } else { + // Default values or error handling + (rng.gen(), rng.gen()) + } + } - SpiderDistribution::Normal { mu, sigma } => { - let x = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0); - let y = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0); + fn generate_diagonal(&self, rng: &mut StdRng) -> (f64, f64) { + if let DistributionParams::Diagonal { percentage, buffer } = self.config.params { + if rng.gen::<f64>() < percentage { + let v = rng.gen(); + (v, v) + } else { + let c: f64 = rng.gen(); + let d: f64 = rand_normal(rng, 0.0, buffer / 5.0); + let x: f64 = (c + d / f64::sqrt(2.0)).clamp(0.0, 1.0); + let y: f64 = (c - d / f64::sqrt(2.0)).clamp(0.0, 1.0); (x, y) } + } else { + // Default values or error handling + (rng.gen(), rng.gen()) + } + } - SpiderDistribution::Diagonal { percentage, buffer } => { - if rng.gen::<f64>() < percentage { - let v = rng.gen(); - (v, v) - } else { - let c: f64 = rng.gen(); - let d: f64 = rand_normal(&mut rng, 0.0, buffer / 5.0); - let x: f64 = (c + d / f64::sqrt(2.0)).clamp(0.0, 1.0); - let y: f64 = (c - d / f64::sqrt(2.0)).clamp(0.0, 1.0); - (x, y) - } - } + fn generate_bit(&self, rng: &mut StdRng) -> (f64, f64) { + if let DistributionParams::Bit { probability, digits } = self.config.params { + let x = spider_bit(rng, probability, digits); + let y = spider_bit(rng, probability, digits); + (x, y) + } else { + // Default values or error handling + (rng.gen(), rng.gen()) + } + } - SpiderDistribution::Bit { probability, digits } => { - let x = spider_bit(&mut rng, probability, digits); - let y = spider_bit(&mut rng, probability, digits); - (x, y) + fn generate_sierpinski(&self, rng: &mut StdRng) -> (f64, f64) { + let (mut x, mut y) = (0.0, 0.0); + let a = (0.0, 0.0); + let b = (1.0, 0.0); + let c = (0.5, (3.0f64).sqrt() / 2.0); + for _ in 0..10 { + match rng.gen_range(0..3) { + 0 => { x = (x + a.0) / 2.0; y = (y + a.1) / 2.0; } + 1 => { x = (x + b.0) / 2.0; y = (y + b.1) / 2.0; } + _ => { x = (x + c.0) / 2.0; y = (y + c.1) / 2.0; } } + } + (x, y) + } + + pub fn generate_parcel(&self, rng: &mut StdRng) -> String { + if let DistributionParams::Parcel { srange, dither } = self.config.params { + let mut box_stack = vec![BoxWithDepth { + depth: 0, + x: 0.0, + y: 0.0, + w: 1.0, + h: 1.0, + }]; + + // Pick a depth based on dim (log2) or fixed depth + let depth_limit = 6; // You can make this configurable if needed - SpiderDistribution::Sierpinski => { - let (mut x, mut y) = (0.0, 0.0); - let a = (0.0, 0.0); - let b = (1.0, 0.0); - let c = (0.5, (3.0f64).sqrt() / 2.0); - for _ in 0..10 { - match rng.gen_range(0..3) { - 0 => { x = (x + a.0) / 2.0; y = (y + a.1) / 2.0; } - 1 => { x = (x + b.0) / 2.0; y = (y + b.1) / 2.0; } - _ => { x = (x + c.0) / 2.0; y = (y + c.1) / 2.0; } - } + for _ in 0..depth_limit { + let b = box_stack.pop().unwrap(); + let (b1, b2) = if b.w > b.h { + let split = b.w * (srange + rand_unit(rng) * (1.0 - 2.0 * srange)); + ( + BoxWithDepth { depth: b.depth + 1, x: b.x, y: b.y, w: split, h: b.h }, + BoxWithDepth { depth: b.depth + 1, x: b.x + split, y: b.y, w: b.w - split, h: b.h }, + ) + } else { + let split = b.h * (srange + rand_unit(rng) * (1.0 - 2.0 * srange)); + ( + BoxWithDepth { depth: b.depth + 1, x: b.x, y: b.y, w: b.w, h: split }, + BoxWithDepth { depth: b.depth + 1, x: b.x, y: b.y + split, w: b.w, h: b.h - split }, + ) + }; + + // Randomly pick one of the two + if rng.gen_bool(0.5) { + box_stack.push(b1); + } else { + box_stack.push(b2); } - (x, y) } + + let mut b = box_stack.pop().unwrap(); + + // Apply dither + let dx = b.w * dither * (rand_unit(rng) - 0.5); + let dy = b.h * dither * (rand_unit(rng) - 0.5); + b.x += dx / 2.0; + b.y += dy / 2.0; + b.w -= dx; + b.h -= dy; + + // Pick random point inside the box + let _x = b.x + rand_unit(rng) * b.w; + let _y = b.y + rand_unit(rng) * b.h; + + self.box_to_wkt(&b) + } else { + self.box_to_wkt(&BoxWithDepth { + depth: 0, + x: 0.0, + y: 0.0, + w: 1.0, + h: 1.0, + }) } } + fn box_to_wkt(&self, b: &BoxWithDepth) -> String { + let corners = [ + (b.x, b.y), + (b.x + b.w, b.y), + (b.x + b.w, b.y + b.h), + (b.x, b.y + b.h), + (b.x, b.y), + ]; + + let affine = self.config.affine.unwrap_or([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]); + + let coords: Vec<String> = corners + .iter() + .map(|&(x, y)| { + let (tx, ty) = apply_affine(x, y, &affine); + format!("{:.6} {:.6}", tx, ty) + }) + .collect(); + + format!("POLYGON (({}))", coords.join(", ")) + } + pub fn generate_pickup_point(&self, trip_id: u64) -> (f64, f64) { let (x, y) = self.generate_point(trip_id); if let Some(aff) = &self.config.affine { @@ -86,6 +235,10 @@ impl SpiderGenerator { } } +pub fn rand_unit(rng: &mut StdRng) -> f64 { + rng.gen::<f64>() // random number in [0.0, 1.0) +} + // Affine transform fn apply_affine(x: f64, y: f64, m: &[f64; 6]) -> (f64, f64) { let x_out = m[0] * x + m[1] * y + m[2]; @@ -114,18 +267,16 @@ fn spider_bit(rng: &mut StdRng, prob: f64, digits: u32) -> f64 { .sum() } -// In tpchgen/src/spider.rs - -impl Default for SpiderGenerator { - fn default() -> Self { - let config = SpiderConfig { - dist: SpiderDistribution::Uniform, - global_seed: 42, - affine: Some([ - 58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, 25.175375, 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), - }; - SpiderGenerator::new(config) - } -} \ No newline at end of file +// impl Default for SpiderGenerator { +// fn default() -> Self { +// let config = SpiderConfig { +// dist: SpiderDistribution::Uniform, +// global_seed: 42, +// affine: Some([ +// 58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24° +// 0.0, 25.175375, 24.006328, // scale Y to 25.18°, offset to 24.00° +// ]), +// }; +// SpiderGenerator::new(config) +// } +// } \ No newline at end of file diff --git a/tpchgen/src/spider_presets.rs b/tpchgen/src/spider_presets.rs new file mode 100644 index 0000000..e6827b0 --- /dev/null +++ b/tpchgen/src/spider_presets.rs @@ -0,0 +1,53 @@ +use crate::spider::{SpiderGenerator, SpiderConfig, DistributionType, DistributionParams, GeomType}; + +pub struct SpiderPresets; + +impl SpiderPresets { + pub fn for_trip_pickups() -> SpiderGenerator { + let config = SpiderConfig { + dist_type: DistributionType::Uniform, + geom_type: GeomType::Point, + dim: 2, + seed: 42, + affine: Some([ + 58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24° + 0.0, 25.175375, 24.006328, // scale Y to 25.18°, offset to 24.00° + ]), + + // geometry = box + width: 0.0, + height: 0.0, + + // geometry = polygon + maxseg: 0, + polysize: 0.0, + + params: DistributionParams::None, + }; + SpiderGenerator::new(config) + } + + pub fn for_building_polygons() -> SpiderGenerator { + let config = SpiderConfig { + dist_type: DistributionType::Parcel, + geom_type: GeomType::Box, + dim: 2, + seed: 12345, + affine: Some([ + 58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24° + 0.0, 25.175375, 24.006328, // scale Y to 25.18°, offset to 24.00° + ]), + + // geometry = box + width: 0.0, + height: 0.0, + + // geometry = polygon + maxseg: 0, + polysize: 0.0, + + params: DistributionParams::Parcel { srange: 0.1, dither: 2.0 }, + }; + SpiderGenerator::new(config) + } +} \ No newline at end of file
