This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git

commit b5c7c572798c773e91af715ca0ecb4978dffd3cd
Author: Pranav Toggi <[email protected]>
AuthorDate: Tue Jun 24 22:46:48 2025 -0700

    add spider implementation for Trip table
---
 tpchgen-arrow/src/customer.rs |   2 +-
 tpchgen-arrow/src/driver.rs   |   2 +-
 tpchgen-arrow/src/trip.rs     |  14 ++++-
 tpchgen-arrow/src/vehicle.rs  |   4 +-
 tpchgen/src/csv.rs            |   6 +-
 tpchgen/src/generators.rs     |  59 +++++++++++++------
 tpchgen/src/lib.rs            |   1 +
 tpchgen/src/spider.rs         | 131 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 193 insertions(+), 26 deletions(-)

diff --git a/tpchgen-arrow/src/customer.rs b/tpchgen-arrow/src/customer.rs
index 2009b3f..283a0a1 100644
--- a/tpchgen-arrow/src/customer.rs
+++ b/tpchgen-arrow/src/customer.rs
@@ -1,4 +1,4 @@
-use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter};
+use crate::conversions::{string_view_array_from_display_iter};
 use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator};
 use arrow::array::{Int64Array, RecordBatch, StringViewArray};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
diff --git a/tpchgen-arrow/src/driver.rs b/tpchgen-arrow/src/driver.rs
index 6585186..1aa4062 100644
--- a/tpchgen-arrow/src/driver.rs
+++ b/tpchgen-arrow/src/driver.rs
@@ -1,4 +1,4 @@
-use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter};
+use crate::conversions::{string_view_array_from_display_iter};
 use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator};
 use arrow::array::{Int64Array, RecordBatch, StringViewArray};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
diff --git a/tpchgen-arrow/src/trip.rs b/tpchgen-arrow/src/trip.rs
index d6cf080..93b6c61 100644
--- a/tpchgen-arrow/src/trip.rs
+++ b/tpchgen-arrow/src/trip.rs
@@ -1,6 +1,6 @@
 use crate::conversions::{decimal128_array_from_iter, to_arrow_date32};
 use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator};
-use arrow::array::{Date32Array, Int64Array, RecordBatch};
+use arrow::array::{Date32Array, Float64Array, Int64Array, RecordBatch};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use std::sync::{Arc, LazyLock, Mutex};
 use tpchgen::generators::{Trip, TripGenerator, TripGeneratorIterator};
@@ -79,6 +79,10 @@ impl Iterator for TripArrow {
         let t_tip = decimal128_array_from_iter(rows.iter().map(|row| row.t_tip));
         let t_totalamount = decimal128_array_from_iter(rows.iter().map(|row| row.t_totalamount));
         let t_distance = decimal128_array_from_iter(rows.iter().map(|row| row.t_distance));
+        let t_pickupx = Float64Array::from_iter_values(rows.iter().map(|row| row.t_pickupx));
+        let t_pickupy = Float64Array::from_iter_values(rows.iter().map(|row| row.t_pickupy));
+        let t_dropoffx = Float64Array::from_iter_values(rows.iter().map(|row| row.t_dropoffx));
+        let t_dropoffy = Float64Array::from_iter_values(rows.iter().map(|row| row.t_dropoffy));
 
         let batch = RecordBatch::try_new(
             Arc::clone(&self.schema),
@@ -93,6 +97,10 @@ impl Iterator for TripArrow {
                 Arc::new(t_tip),
                 Arc::new(t_totalamount),
                 Arc::new(t_distance),
+                Arc::new(t_pickupx),
+                Arc::new(t_pickupy),
+                Arc::new(t_dropoffx),
+                Arc::new(t_dropoffy),
             ],
         )
             .unwrap();
@@ -116,5 +124,9 @@ fn make_trip_schema() -> SchemaRef {
         Field::new("t_tip", DataType::Decimal128(15, 2), false),
         Field::new("t_totalamount", DataType::Decimal128(15, 2), false),
         Field::new("t_distance", DataType::Decimal128(15, 2), false),
+        Field::new("t_pickupx", DataType::Float64, false),
+        Field::new("t_pickupy", DataType::Float64, false),
+        Field::new("t_dropoffx", DataType::Float64, false),
+        Field::new("t_dropoffy", DataType::Float64, false),
     ]))
 }
\ No newline at end of file
diff --git a/tpchgen-arrow/src/vehicle.rs b/tpchgen-arrow/src/vehicle.rs
index 84991e2..8c0e789 100644
--- a/tpchgen-arrow/src/vehicle.rs
+++ b/tpchgen-arrow/src/vehicle.rs
@@ -1,6 +1,6 @@
-use crate::conversions::{decimal128_array_from_iter, string_view_array_from_display_iter};
+use crate::conversions::{string_view_array_from_display_iter};
 use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator};
-use arrow::array::{Int32Array, Int64Array, RecordBatch, StringViewArray};
+use arrow::array::{Int64Array, RecordBatch, StringViewArray};
 use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
 use std::sync::{Arc, LazyLock};
 use tpchgen::generators::{VehicleGenerator, VehicleGeneratorIterator};
diff --git a/tpchgen/src/csv.rs b/tpchgen/src/csv.rs
index 4718b04..9331f9b 100644
--- a/tpchgen/src/csv.rs
+++ b/tpchgen/src/csv.rs
@@ -412,7 +412,7 @@ impl<'a> TripCsv {
 
     /// Returns the CSV header for the Trip table
     pub fn header() -> &'static str {
-        "t_tripkey,t_custkey,t_driverkey,t_vehiclekey,t_pickuptime,t_dropofftime,t_fare,t_tip,t_totalamount,t_distance"
+        "t_tripkey,t_custkey,t_driverkey,t_vehiclekey,t_pickuptime,t_dropofftime,t_fare,t_tip,t_totalamount,t_distance,t_pickupx,t_pickupy"
     }
 }
 
@@ -421,7 +421,7 @@ impl Display for TripCsv {
         write!(
             f,
             // note must quote location and comment fields as they may contain commas
-            "{},{},{},{},{},{},{},{},{},{}",
+            "{},{},{},{},{},{},{},{},{},{},{},{}",
             self.inner.t_tripkey,
             self.inner.t_custkey,
             self.inner.t_driverkey,
@@ -432,6 +432,8 @@ impl Display for TripCsv {
             self.inner.t_tip,
             self.inner.t_totalamount,
             self.inner.t_distance,
+            self.inner.t_pickupx,
+            self.inner.t_pickupy,
         )
     }
 }
diff --git a/tpchgen/src/generators.rs b/tpchgen/src/generators.rs
index 292bb77..694ac88 100644
--- a/tpchgen/src/generators.rs
+++ b/tpchgen/src/generators.rs
@@ -10,9 +10,11 @@ use crate::random::{RandomAlphaNumeric, RandomAlphaNumericInstance};
 use crate::text::TextPool;
 use core::fmt;
 use std::fmt::Display;
-
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
 use crate::dates::{GenerateUtils, TPCHDate};
 use crate::random::{RandomBoundedInt, RandomString, RandomStringSequence, RandomText};
+use crate::spider::SpiderGenerator;
 
 /// Generator for Nation table data
 #[derive(Debug, Clone)]
@@ -1995,13 +1997,19 @@ pub struct Trip {
     pub t_totalamount: TPCHDecimal,
     /// Trip distance
     pub t_distance: TPCHDecimal,
+    /// Trip pickup coordinates
+    pub t_pickupx: f64,
+    pub t_pickupy: f64,
+    /// Trip dropoff coordinates
+    pub t_dropoffx: f64,
+    pub t_dropoffy: f64,
 }
 
 impl fmt::Display for Trip {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
-            "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|",
+            "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|",
             self.t_tripkey,
             self.t_custkey,
             self.t_driverkey,
@@ -2011,7 +2019,11 @@ impl fmt::Display for Trip {
             self.t_fare,
             self.t_tip,
             self.t_totalamount,
-            self.t_distance
+            self.t_distance,
+            self.t_pickupx,
+            self.t_pickupy,
+            self.t_dropoffx,
+            self.t_dropoffy,
         )
     }
 }
@@ -2025,6 +2037,7 @@ pub struct TripGenerator {
     distributions: Distributions,
     text_pool: TextPool,
     distance_kde: crate::kde::DistanceKDE,
+    spatial_gen: SpiderGenerator,
 }
 
 impl TripGenerator {
@@ -2032,8 +2045,6 @@ impl TripGenerator {
     const SCALE_BASE: i32 = 1_500_000;
 
     // Constants for trip generation
-    const DISTANCE_MIN: i32 = 1;   // 1.0 miles
-    const DISTANCE_MAX: i32 = 500; // 50.0 miles
     const FARE_MIN_PER_MILE: i32 = 150; // $1.50 per mile
     const FARE_MAX_PER_MILE: i32 = 300; // $3.00 per mile
     const TIP_PERCENT_MIN: i32 = 0;     // 0% tip
@@ -2050,6 +2061,7 @@ impl TripGenerator {
             Distributions::static_default(),
             TextPool::get_or_init_default(),
             crate::kde::default_distance_kde(),
+            SpiderGenerator::default(),
         )
     }
 
@@ -2061,6 +2073,7 @@ impl TripGenerator {
         distributions: &'b Distributions,
         text_pool: &'b TextPool,
         distance_kde: crate::kde::DistanceKDE,
+        spatial_gen: SpiderGenerator
     ) -> TripGenerator {
         TripGenerator {
             scale_factor,
@@ -2069,6 +2082,7 @@ impl TripGenerator {
             distributions: distributions.clone(),
             text_pool: text_pool.clone(),
             distance_kde,
+            spatial_gen,
         }
     }
 
@@ -2096,6 +2110,7 @@ impl TripGenerator {
                 self.vehicle_count,
             ),
             self.distance_kde.clone(), // Add the KDE model
+            self.spatial_gen.clone(),
         )
     }
 }
@@ -2116,11 +2131,11 @@ pub struct TripGeneratorIterator {
     driver_key_random: RandomBoundedLong,
     vehicle_key_random: RandomBoundedLong,
     pickup_date_random: RandomBoundedInt,
-    distance_random: RandomBoundedInt,
     fare_per_mile_random: RandomBoundedInt,
     tip_percent_random: RandomBoundedInt,
     trip_minutes_per_mile_random: RandomBoundedInt,
     distance_kde: crate::kde::DistanceKDE,
+    spatial_gen: SpiderGenerator,
 
     scale_factor: f64,
     start_index: i64,
@@ -2139,6 +2154,7 @@ impl TripGeneratorIterator {
         start_index: i64,
         row_count: i64,
         distance_kde: crate::kde::DistanceKDE,
+        spatial_gen: SpiderGenerator
     ) -> Self {
         // Create all the randomizers
         let max_customer_key = (CustomerGenerator::SCALE_BASE as f64 * scale_factor) as i64;
@@ -2152,13 +2168,7 @@ impl TripGeneratorIterator {
         let mut pickup_date_random = RandomBoundedInt::new(
             831649288,
             dates::MIN_GENERATE_DATE,
-            dates::MIN_GENERATE_DATE + dates::TOTAL_DATE_RANGE - TripGenerator::TRIP_DURATION_MAX_PER_MILE * TripGenerator::DISTANCE_MAX / 60 / 24
-        );
-
-        let mut distance_random = RandomBoundedInt::new(
-            692134278,
-            TripGenerator::DISTANCE_MIN,
-            TripGenerator::DISTANCE_MAX
+            dates::MIN_GENERATE_DATE + dates::TOTAL_DATE_RANGE
         );
 
         let mut fare_per_mile_random = RandomBoundedInt::new(
@@ -2184,7 +2194,6 @@ impl TripGeneratorIterator {
         driver_key_random.advance_rows(start_index);
         vehicle_key_random.advance_rows(start_index);
         pickup_date_random.advance_rows(start_index);
-        distance_random.advance_rows(start_index);
         fare_per_mile_random.advance_rows(start_index);
         tip_percent_random.advance_rows(start_index);
         trip_minutes_per_mile_random.advance_rows(start_index);
@@ -2194,11 +2203,11 @@ impl TripGeneratorIterator {
             driver_key_random,
             vehicle_key_random,
             pickup_date_random,
-            distance_random,
             fare_per_mile_random,
             tip_percent_random,
             trip_minutes_per_mile_random,
             distance_kde, // Store the KDE model
+            spatial_gen,
 
             scale_factor,
             start_index,
@@ -2232,13 +2241,22 @@ impl TripGeneratorIterator {
         let pickup_date_value = self.pickup_date_random.next_value();
         let pickup_date = TPCHDate::new(pickup_date_value);
 
-        // let distance_value = self.distance_random.next_value();
-        // let distance = TPCHDecimal((distance_value * 10) as i64); // Convert to i64
-
         // Get distance from KDE model (in miles with decimal precision)
         let distance_value = self.distance_kde.generate(trip_key as u64);
         let distance = TPCHDecimal((distance_value * 100.0) as i64);
 
+        // Pickup
+        let (pickup_x, pickup_y) = self.spatial_gen.generate_pickup_point(trip_key as u64);
+
+        // Angle
+        let angle_seed = crate::spider::spider_seed_for_index(trip_key as u64, 1234);
+        let mut angle_rng = StdRng::seed_from_u64(angle_seed);
+        let angle: f64 = angle_rng.gen::<f64>() * std::f64::consts::TAU;
+
+        // Dropoff via polar projection
+        let dropoff_x = pickup_x + distance_value * angle.cos();
+        let dropoff_y = pickup_y + distance_value * angle.sin();
+
         // Fix multiplication of f64 by integers by using f64 literals
         let fare_per_mile = self.fare_per_mile_random.next_value() as f64;
         let fare_value = (distance_value * fare_per_mile) / 100.0;
@@ -2268,6 +2286,10 @@ impl TripGeneratorIterator {
             t_tip: tip,
             t_totalamount: total,
             t_distance: distance,
+            t_pickupx: pickup_x,
+            t_pickupy: pickup_y,
+            t_dropoffx: dropoff_x,
+            t_dropoffy: dropoff_y,
         }
     }
 }
@@ -2287,7 +2309,6 @@ impl<'a> Iterator for TripGeneratorIterator {
         self.driver_key_random.row_finished();
         self.vehicle_key_random.row_finished();
         self.pickup_date_random.row_finished();
-        self.distance_random.row_finished();
         self.fare_per_mile_random.row_finished();
         self.tip_percent_random.row_finished();
         self.trip_minutes_per_mile_random.row_finished();
diff --git a/tpchgen/src/lib.rs b/tpchgen/src/lib.rs
index 5eea543..4fee43c 100644
--- a/tpchgen/src/lib.rs
+++ b/tpchgen/src/lib.rs
@@ -60,3 +60,4 @@ pub mod queries;
 pub mod random;
 pub mod text;
 pub mod kde;
+mod spider;
diff --git a/tpchgen/src/spider.rs b/tpchgen/src/spider.rs
new file mode 100644
index 0000000..d1bf86b
--- /dev/null
+++ b/tpchgen/src/spider.rs
@@ -0,0 +1,131 @@
+use rand::{Rng, SeedableRng};
+use rand::rngs::StdRng;
+
+#[derive(Debug, Clone, Copy)]
+pub enum SpiderDistribution {
+    Uniform,
+    Normal { mu: f64, sigma: f64 },
+    Diagonal { percentage: f64, buffer: f64 },
+    Bit { probability: f64, digits: u32 },
+    Sierpinski,
+}
+
+#[derive(Debug, Clone)]
+pub struct SpiderConfig {
+    pub dist: SpiderDistribution,
+    pub global_seed: u64,
+    pub affine: Option<[f64; 6]>,
+}
+
+#[derive(Clone, Debug)]
+pub struct SpiderGenerator {
+    pub config: SpiderConfig,
+}
+
+impl SpiderGenerator {
+    pub fn new(config: SpiderConfig) -> Self {
+        Self { config }
+    }
+
+    pub fn generate_point(&self, index: u64) -> (f64, f64) {
+        let seed = spider_seed_for_index(index, self.config.global_seed);
+        let mut rng = StdRng::seed_from_u64(seed);
+
+        match self.config.dist {
+            SpiderDistribution::Uniform => (rng.gen(), rng.gen()),
+
+            SpiderDistribution::Normal { mu, sigma } => {
+                let x = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0);
+                let y = rand_normal(&mut rng, mu, sigma).clamp(0.0, 1.0);
+                (x, y)
+            }
+
+            SpiderDistribution::Diagonal { percentage, buffer } => {
+                if rng.gen::<f64>() < percentage {
+                    let v = rng.gen();
+                    (v, v)
+                } else {
+                    let c: f64 = rng.gen();
+                    let d: f64 = rand_normal(&mut rng, 0.0, buffer / 5.0);
+                    let x: f64 = (c + d / f64::sqrt(2.0)).clamp(0.0, 1.0);
+                    let y: f64 = (c - d / f64::sqrt(2.0)).clamp(0.0, 1.0);
+                    (x, y)
+                }
+            }
+
+            SpiderDistribution::Bit { probability, digits } => {
+                let x = spider_bit(&mut rng, probability, digits);
+                let y = spider_bit(&mut rng, probability, digits);
+                (x, y)
+            }
+
+            SpiderDistribution::Sierpinski => {
+                let (mut x, mut y) = (0.0, 0.0);
+                let a = (0.0, 0.0);
+                let b = (1.0, 0.0);
+                let c = (0.5, (3.0f64).sqrt() / 2.0);
+                for _ in 0..10 {
+                    match rng.gen_range(0..3) {
+                        0 => { x = (x + a.0) / 2.0; y = (y + a.1) / 2.0; }
+                        1 => { x = (x + b.0) / 2.0; y = (y + b.1) / 2.0; }
+                        _ => { x = (x + c.0) / 2.0; y = (y + c.1) / 2.0; }
+                    }
+                }
+                (x, y)
+            }
+        }
+    }
+
+    pub fn generate_pickup_point(&self, trip_id: u64) -> (f64, f64) {
+        let (x, y) = self.generate_point(trip_id);
+        if let Some(aff) = &self.config.affine {
+            apply_affine(x, y, aff)
+        } else {
+            (x, y)
+        }
+    }
+}
+
+// Affine transform
+fn apply_affine(x: f64, y: f64, m: &[f64; 6]) -> (f64, f64) {
+    let x_out = m[0] * x + m[1] * y + m[2];
+    let y_out = m[3] * x + m[4] * y + m[5];
+    (x_out, y_out)
+}
+
+// Deterministic hash (SplitMix64-like)
+pub fn spider_seed_for_index(index: u64, global_seed: u64) -> u64 {
+    let mut z = index.wrapping_add(global_seed).wrapping_add(0x9E3779B97F4A7C15);
+    z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
+    z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
+    z ^ (z >> 31)
+}
+
+// Box-Muller transform
+fn rand_normal(rng: &mut StdRng, mu: f64, sigma: f64) -> f64 {
+    let u1: f64 = rng.gen();
+    let u2: f64 = rng.gen();
+    mu + sigma * (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
+}
+
+fn spider_bit(rng: &mut StdRng, prob: f64, digits: u32) -> f64 {
+    (1..=digits)
+        .map(|i| if rng.gen::<f64>() < prob { 1.0 / 2f64.powi(i as i32) } else { 0.0 })
+        .sum()
+}
+
+// In tpchgen/src/spider.rs
+
+impl Default for SpiderGenerator {
+    fn default() -> Self {
+        let config = SpiderConfig {
+            dist: SpiderDistribution::Uniform,
+            global_seed: 42,
+            affine: Some([
+                58.368269, 0.0, -125.244606, // scale X to 58.37°, offset to -125.24°
+                0.0, 25.175375, 24.006328,    // scale Y to 25.18°, offset to 24.00°
+            ]),
+        };
+        SpiderGenerator::new(config)
+    }
+}
\ No newline at end of file

Reply via email to