This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit b5c7c572798c773e91af715ca0ecb4978dffd3cd Author: Pranav Toggi <[email protected]> AuthorDate: Tue Jun 24 22:46:48 2025 -0700 add spider implementation for Trip table --- tpchgen-arrow/src/customer.rs | 2 +- tpchgen-arrow/src/driver.rs | 2 +- tpchgen-arrow/src/trip.rs | 14 ++++- tpchgen-arrow/src/vehicle.rs | 4 +- tpchgen/src/csv.rs | 6 +- tpchgen/src/generators.rs | 59 +++++++++++++------ tpchgen/src/lib.rs | 1 + tpchgen/src/spider.rs | 131 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 193 insertions(+), 26 deletions(-) diff --git a/tpchgen-arrow/src/customer.rs b/tpchgen-arrow/src/customer.rs index 2009b3f..283a0a1 100644 --- a/tpchgen-arrow/src/customer.rs +++ b/tpchgen-arrow/src/customer.rs @@ -1,4 +1,4 @@ -use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter}; +use crate::conversions::{string_view_array_from_display_iter}; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; diff --git a/tpchgen-arrow/src/driver.rs b/tpchgen-arrow/src/driver.rs index 6585186..1aa4062 100644 --- a/tpchgen-arrow/src/driver.rs +++ b/tpchgen-arrow/src/driver.rs @@ -1,4 +1,4 @@ -use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter}; +use crate::conversions::{string_view_array_from_display_iter}; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; diff --git a/tpchgen-arrow/src/trip.rs b/tpchgen-arrow/src/trip.rs index d6cf080..93b6c61 100644 --- a/tpchgen-arrow/src/trip.rs +++ b/tpchgen-arrow/src/trip.rs @@ -1,6 +1,6 @@ use crate::conversions::{decimal128_array_from_iter, to_arrow_date32}; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; -use arrow::array::{Date32Array, Int64Array, RecordBatch}; +use arrow::array::{Date32Array, Float64Array, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use std::sync::{Arc, LazyLock, Mutex}; use tpchgen::generators::{Trip, TripGenerator, TripGeneratorIterator}; @@ -79,6 +79,10 @@ impl Iterator for TripArrow { let t_tip = decimal128_array_from_iter(rows.iter().map(|row| row.t_tip)); let t_totalamount = decimal128_array_from_iter(rows.iter().map(|row| row.t_totalamount)); let t_distance = decimal128_array_from_iter(rows.iter().map(|row| row.t_distance)); + let t_pickupx = Float64Array::from_iter_values(rows.iter().map(|row| row.t_pickupx)); + let t_pickupy = Float64Array::from_iter_values(rows.iter().map(|row| row.t_pickupy)); + let t_dropoffx = Float64Array::from_iter_values(rows.iter().map(|row| row.t_dropoffx)); + let t_dropoffy = Float64Array::from_iter_values(rows.iter().map(|row| row.t_dropoffy)); let batch = RecordBatch::try_new( Arc::clone(&self.schema), @@ -93,6 +97,10 @@ impl Iterator for TripArrow { Arc::new(t_tip), Arc::new(t_totalamount), Arc::new(t_distance), + Arc::new(t_pickupx), + Arc::new(t_pickupy), + Arc::new(t_dropoffx), + Arc::new(t_dropoffy), ], ) .unwrap(); @@ -116,5 +124,9 @@ fn make_trip_schema() -> SchemaRef { Field::new("t_tip", DataType::Decimal128(15, 2), false), Field::new("t_totalamount", DataType::Decimal128(15, 2), false), Field::new("t_distance", DataType::Decimal128(15, 2), false), + Field::new("t_pickupx", DataType::Float64, false), + Field::new("t_pickupy", DataType::Float64, false), + Field::new("t_dropoffx", DataType::Float64, false), + Field::new("t_dropoffy", DataType::Float64, false), ])) } \ No newline at end of file diff --git a/tpchgen-arrow/src/vehicle.rs b/tpchgen-arrow/src/vehicle.rs index 84991e2..8c0e789 100644 --- a/tpchgen-arrow/src/vehicle.rs +++ b/tpchgen-arrow/src/vehicle.rs @@ -1,6 +1,6 @@ -use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter}; +use crate::conversions::{string_view_array_from_display_iter}; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; -use arrow::array::{Int32Array, Int64Array, RecordBatch, StringViewArray}; +use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use std::sync::{Arc, LazyLock}; use tpchgen::generators::{VehicleGenerator, VehicleGeneratorIterator}; diff --git a/tpchgen/src/csv.rs b/tpchgen/src/csv.rs index 4718b04..9331f9b 100644 --- a/tpchgen/src/csv.rs +++ b/tpchgen/src/csv.rs @@ -412,7 +412,7 @@ impl<'a> TripCsv { /// Returns the CSV header for the Trip table pub fn header() -> &'static str { - "t_tripkey,t_custkey,t_driverkey,t_vehiclekey,t_pickuptime,t_dropofftime,t_fare,t_tip,t_totalamount,t_distance" + "t_tripkey,t_custkey,t_driverkey,t_vehiclekey,t_pickuptime,t_dropofftime,t_fare,t_tip,t_totalamount,t_distance,t_pickupx,t_pickupy" } } @@ -421,7 +421,7 @@ impl Display for TripCsv { write!( f, // note must quote location and comment fields as they may contain commas - "{},{},{},{},{},{},{},{},{},{}", + "{},{},{},{},{},{},{},{},{},{},{},{}", self.inner.t_tripkey, self.inner.t_custkey, self.inner.t_driverkey, @@ -432,6 +432,8 @@ impl Display for TripCsv { self.inner.t_tip, self.inner.t_totalamount, self.inner.t_distance, + self.inner.t_pickupx, + self.inner.t_pickupy, ) } } diff --git a/tpchgen/src/generators.rs b/tpchgen/src/generators.rs index 292bb77..694ac88 100644 --- a/tpchgen/src/generators.rs +++ b/tpchgen/src/generators.rs @@ -10,9 +10,11 @@ use crate::random::{RandomAlphaNumeric, RandomAlphaNumericInstance}; use crate::text::TextPool; use core::fmt; use std::fmt::Display; - +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use crate::dates::{GenerateUtils, TPCHDate}; use crate::random::{RandomBoundedInt, RandomString, RandomStringSequence, RandomText}; +use crate::spider::SpiderGenerator; /// Generator for Nation table data #[derive(Debug, Clone)] @@ -1995,13 +1997,19 @@ pub struct Trip { pub t_totalamount: TPCHDecimal, /// Trip distance pub t_distance: TPCHDecimal, + /// Trip pickup coordinates + pub t_pickupx: f64, + pub t_pickupy: f64, + /// Trip dropoff coordinates + pub t_dropoffx: f64, + pub t_dropoffy: f64, } impl fmt::Display for Trip { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", + "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", self.t_tripkey, self.t_custkey, self.t_driverkey, @@ -2011,7 +2019,11 @@ impl fmt::Display for Trip { self.t_fare, self.t_tip, self.t_totalamount, - self.t_distance + self.t_distance, + self.t_pickupx, + self.t_pickupy, + self.t_dropoffx, + self.t_dropoffy, ) } } @@ -2025,6 +2037,7 @@ pub struct TripGenerator { distributions: Distributions, text_pool: TextPool, distance_kde: crate::kde::DistanceKDE, + spatial_gen: SpiderGenerator, } impl TripGenerator { @@ -2032,8 +2045,6 @@ impl TripGenerator { const SCALE_BASE: i32 = 1_500_000; // Constants for trip generation - const DISTANCE_MIN: i32 = 1; // 1.0 miles - const DISTANCE_MAX: i32 = 500; // 50.0 miles const FARE_MIN_PER_MILE: i32 = 150; // $1.50 per mile const FARE_MAX_PER_MILE: i32 = 300; // $3.00 per mile const TIP_PERCENT_MIN: i32 = 0; // 0% tip @@ -2050,6 +2061,7 @@ impl TripGenerator { Distributions::static_default(), TextPool::get_or_init_default(), crate::kde::default_distance_kde(), + SpiderGenerator::default(), ) } @@ -2061,6 +2073,7 @@ impl TripGenerator { distributions: &'b Distributions, text_pool: &'b TextPool, distance_kde: crate::kde::DistanceKDE, + spatial_gen: SpiderGenerator ) -> TripGenerator { TripGenerator { scale_factor, @@ -2069,6 +2082,7 @@ impl TripGenerator { distributions: distributions.clone(), text_pool: text_pool.clone(), distance_kde, + spatial_gen, } } @@ -2096,6 +2110,7 @@ impl TripGenerator { self.vehicle_count, ), self.distance_kde.clone(), // Add the KDE model + self.spatial_gen.clone(), ) } } @@ -2116,11 +2131,11 @@ pub struct TripGeneratorIterator { driver_key_random: RandomBoundedLong, vehicle_key_random: RandomBoundedLong, pickup_date_random: RandomBoundedInt, - distance_random: RandomBoundedInt, fare_per_mile_random: RandomBoundedInt, tip_percent_random: RandomBoundedInt, trip_minutes_per_mile_random: RandomBoundedInt, distance_kde: crate::kde::DistanceKDE, + spatial_gen: SpiderGenerator, scale_factor: f64, start_index: i64, @@ -2139,6 +2154,7 @@ impl TripGeneratorIterator { start_index: i64, row_count: i64, distance_kde: crate::kde::DistanceKDE, + spatial_gen: SpiderGenerator ) -> Self { // Create all the randomizers let max_customer_key = (CustomerGenerator::SCALE_BASE as f64 * scale_factor) as i64; @@ -2152,13 +2168,7 @@ impl TripGeneratorIterator { let mut pickup_date_random = RandomBoundedInt::new( 831649288, dates::MIN_GENERATE_DATE, - dates::MIN_GENERATE_DATE + dates::TOTAL_DATE_RANGE - TripGenerator::TRIP_DURATION_MAX_PER_MILE * TripGenerator::DISTANCE_MAX / 60 / 24 - ); - - let mut distance_random = RandomBoundedInt::new( - 692134278, - TripGenerator::DISTANCE_MIN, - TripGenerator::DISTANCE_MAX + dates::MIN_GENERATE_DATE + dates::TOTAL_DATE_RANGE ); let mut fare_per_mile_random = RandomBoundedInt::new( @@ -2184,7 +2194,6 @@ impl TripGeneratorIterator { driver_key_random.advance_rows(start_index); vehicle_key_random.advance_rows(start_index); pickup_date_random.advance_rows(start_index); - distance_random.advance_rows(start_index); fare_per_mile_random.advance_rows(start_index); tip_percent_random.advance_rows(start_index); trip_minutes_per_mile_random.advance_rows(start_index); @@ -2194,11 +2203,11 @@ impl TripGeneratorIterator { driver_key_random, vehicle_key_random, pickup_date_random, - distance_random, fare_per_mile_random, tip_percent_random, trip_minutes_per_mile_random, distance_kde, // Store the KDE model + spatial_gen, scale_factor, start_index, @@ -2232,13 +2241,22 @@ impl TripGeneratorIterator { let pickup_date_value = self.pickup_date_random.next_value(); let pickup_date = TPCHDate::new(pickup_date_value); - // let distance_value = self.distance_random.next_value(); - // let distance = TPCHDecimal((distance_value * 10) as i64); // Convert to i64 - // Get distance from KDE model (in miles with decimal precision) let distance_value = self.distance_kde.generate(trip_key as u64); let distance = TPCHDecimal((distance_value * 100.0) as i64); + // Pickup + let (pickup_x, pickup_y) = self.spatial_gen.generate_pickup_point(trip_key as u64); + + // Angle + let angle_seed = crate::spider::spider_seed_for_index(trip_key as u64, 1234); + let mut angle_rng = StdRng::seed_from_u64(angle_seed); + let angle: f64 = angle_rng.gen::<f64>() * std::f64::consts::TAU; + + // Dropoff via polar projection + let dropoff_x = pickup_x + distance_value * angle.cos(); + let dropoff_y = pickup_y + distance_value * angle.sin(); + // Fix multiplication of f64 by integers by using f64 literals let fare_per_mile = self.fare_per_mile_random.next_value() as f64; let fare_value = (distance_value * fare_per_mile) / 100.0; @@ -2268,6 +2286,10 @@ impl TripGeneratorIterator { t_tip: tip, t_totalamount: total, t_distance: distance, + t_pickupx: pickup_x, + t_pickupy: pickup_y, + t_dropoffx: dropoff_x, + t_dropoffy: dropoff_y, } } } @@ -2287,7 +2309,6 @@ impl<'a> Iterator for TripGeneratorIterator { self.driver_key_random.row_finished(); self.vehicle_key_random.row_finished(); self.pickup_date_random.row_finished(); - self.distance_random.row_finished(); self.fare_per_mile_random.row_finished(); self.tip_percent_random.row_finished(); self.trip_minutes_per_mile_random.row_finished(); diff --git a/tpchgen/src/lib.rs b/tpchgen/src/lib.rs index 5eea543..4fee43c 100644 --- a/tpchgen/src/lib.rs +++ b/tpchgen/src/lib.rs @@ -60,3 +60,4 @@ pub mod queries; pub mod random; pub mod text; pub mod kde; +mod spider; diff --git a/tpchgen/src/spider.rs b/tpchgen/src/spider.rs new file mode 100644 index 0000000..d1bf86b --- /dev/null +++ b/tpchgen/src/spider.rs @@ -0,0 +1,131 @@ +use rand::{Rng, SeedableRng}; +use rand::rngs::StdRng; + +#[derive(Debug, Clone, Copy)] +pub enum SpiderDistribution { + Uniform, + Normal { mu: f64, sigma: f64 }, + Diagonal { percentage: f64, buffer: f64 }, + Bit { probability: f64, digits: u32 }, + Sierpinski, +} + +#[derive(Debug, Clone)] +pub struct SpiderConfig { + pub dist: SpiderDistribution, + pub global_seed: u64, + pub affine: Option<[f64; 6]>, +} + +#[derive(Clone, Debug)] +pub struct SpiderGenerator { + pub config: SpiderConfig, +} + +impl SpiderGenerator { + pub fn new(config: SpiderConfig) -> Self { + Self { config } + } + + pub fn generate_point(&self, index: u64) -> (f64, f64) { + let seed = spider_seed_for_index(index, self.config.global_seed); + let mut rng = StdRng::seed_from_u64(seed); + + match self.config.dist { + SpiderDistribution::Uniform => (rng.gen(), rng.gen()), + + SpiderDistribution::Normal { mu, sigma } => { + let x = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0); + let y = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0); + (x, y) + } + + SpiderDistribution::Diagonal { percentage, buffer } => { + if rng.gen::<f64>() < percentage { + let v = rng.gen(); + (v, v) + } else { + let c: f64 = rng.gen(); + let d: f64 = rand_normal(&mut rng, 0.0, buffer / 5.0); + let x: f64 = (c + d / f64::sqrt(2.0)).clamp(0.0, 1.0); + let y: f64 = (c - d / f64::sqrt(2.0)).clamp(0.0, 1.0); + (x, y) + } + } + + SpiderDistribution::Bit { probability, digits } => { + let x = spider_bit(&mut rng, probability, digits); + let y = spider_bit(&mut rng, probability, digits); + (x, y) + } + + SpiderDistribution::Sierpinski => { + let (mut x, mut y) = (0.0, 0.0); + let a = (0.0, 0.0); + let b = (1.0, 0.0); + let c = (0.5, (3.0f64).sqrt() / 2.0); + for _ in 0..10 { + match rng.gen_range(0..3) { + 0 => { x = (x + a.0) / 2.0; y = (y + a.1) / 2.0; } + 1 => { x = (x + b.0) / 2.0; y = (y + b.1) / 2.0; } + _ => { x = (x + c.0) / 2.0; y = (y + c.1) / 2.0; } + } + } + (x, y) + } + } + } + + pub fn generate_pickup_point(&self, trip_id: u64) -> (f64, f64) { + let (x, y) = self.generate_point(trip_id); + if let Some(aff) = &self.config.affine { + apply_affine(x, y, aff) + } else { + (x, y) + } + } +} + +// Affine transform +fn apply_affine(x: f64, y: f64, m: &[f64; 6]) -> (f64, f64) { + let x_out = m[0] * x + m[1] * y + m[2]; + let y_out = m[3] * x + m[4] * y + m[5]; + (x_out, y_out) +} + +// Deterministic hash (SplitMix64-like) +pub fn spider_seed_for_index(index: u64, global_seed: u64) -> u64 { + let mut z = index.wrapping_add(global_seed).wrapping_add(0x9E3779B97F4A7C15); + z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB); + z ^ (z >> 31) +} + +// Box-Muller transform +fn rand_normal(rng: &mut StdRng, mu: f64, sigma: f64) -> f64 { + let u1: f64 = rng.gen(); + let u2: f64 = rng.gen(); + mu + sigma * (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +fn spider_bit(rng: &mut StdRng, prob: f64, digits: u32) -> f64 { + (1..=digits) + .map(|i| if rng.gen::<f64>() < prob { 1.0 / 2f64.powi(i as i32) } else { 0.0 }) + .sum() +} + +// In tpchgen/src/spider.rs + +impl Default for SpiderGenerator { + fn default() -> Self { + let config = SpiderConfig { + dist: SpiderDistribution::Uniform, + global_seed: 42, + affine: Some([ + 58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24° + 0.0, 25.175375, 24.006328, // scale Y to 25.18°, offset to 24.00° + ]), + }; + SpiderGenerator::new(config) + } +} \ No newline at end of file
