This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git


The following commit(s) were added to refs/heads/main by this push:
     new 2be2f35  Improve the random benchmark data generator to generate non-identical pairs of geometries (#70)
2be2f35 is described below

commit 2be2f354d8f3f6a556f426df7e6902e9d3bfc691
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Fri Sep 12 22:25:36 2025 +0800

    Improve the random benchmark data generator to generate non-identical pairs of geometries (#70)
---
 benchmarks/test_bench_base.py                    | 51 +++++++++++++++++++++---
 python/sedonadb/tests/functions/test_distance.py |  1 +
 python/sedonadb/tests/test_sjoin.py              |  4 +-
 rust/sedona-testing/src/datagen.rs               | 17 +++++---
 4 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/benchmarks/test_bench_base.py b/benchmarks/test_bench_base.py
index 8e62f35..2c8061d 100644
--- a/benchmarks/test_bench_base.py
+++ b/benchmarks/test_bench_base.py
@@ -27,13 +27,13 @@ class TestBenchBase:
         num_geoms = 100_000
 
         # Setup tables
-        for name, options in [
+        for name, base_options in [
             (
                 "segments_large",
                 {
                     "geom_type": "LineString",
                     "target_rows": num_geoms,
-                    "vertices_per_linestring_range": [2, 2],
+                    "vertices_per_linestring_range": [2, 10],
                 },
             ),
             (
@@ -69,13 +69,52 @@ class TestBenchBase:
                 },
             ),
         ]:
-            # Generate synthetic data
+            # Generate synthetic data with two different geometry sets that 
have overlapping spatial distribution
+            # The intersection rate between geom1 and geom2 will be around 2%.
+            # This creates more realistic workloads for spatial predicates.
+
+            # Options for first geometry set (geom1) - left-leaning 
distribution
+            options1 = base_options.copy()
+            options1.update(
+                {
+                    "seed": 42,
+                    "bounds": [0.0, 0.0, 80.0, 100.0],  # Slightly left-leaning
+                    "size_range": [
+                        1.0,
+                        15.0,
+                    ],  # Medium-sized geometries for good intersection chance
+                }
+            )
+
+            # Options for second geometry set (geom2) - right-leaning 
distribution
+            options2 = base_options.copy()
+            options2.update(
+                {
+                    "seed": 43,
+                    "bounds": [20.0, 0.0, 100.0, 100.0],  # Slightly 
right-leaning
+                    "size_range": [1.0, 15.0],  # Same size range for fair 
comparison
+                }
+            )
+
             query = f"""
+                WITH geom1_data AS (
+                    SELECT
+                        geometry as geom1,
+                        row_number() OVER () as id
+                    FROM sd_random_geometry('{json.dumps(options1)}')
+                ),
+                geom2_data AS (
+                    SELECT
+                        geometry as geom2,
+                        row_number() OVER () as id
+                    FROM sd_random_geometry('{json.dumps(options2)}')
+                )
                 SELECT
-                    geometry as geom1,
-                    geometry as geom2,
+                    g1.geom1,
+                    g2.geom2,
                     round(random() * 100) as integer
-                FROM sd_random_geometry('{json.dumps(options)}')
+                FROM geom1_data g1
+                JOIN geom2_data g2 ON g1.id = g2.id
             """
             tab = self.sedonadb.execute_and_collect(query)
 
diff --git a/python/sedonadb/tests/functions/test_distance.py 
b/python/sedonadb/tests/functions/test_distance.py
index 8ca589c..bd520f2 100644
--- a/python/sedonadb/tests/functions/test_distance.py
+++ b/python/sedonadb/tests/functions/test_distance.py
@@ -43,4 +43,5 @@ def test_st_distance(eng, geom1, geom2, expected):
     eng.assert_query_result(
         f"SELECT ST_Distance({geom_or_null(geom1)}, {geom_or_null(geom2)})",
         expected,
+        numeric_epsilon=1e-8,
     )
diff --git a/python/sedonadb/tests/test_sjoin.py 
b/python/sedonadb/tests/test_sjoin.py
index 2a3728f..5c29215 100644
--- a/python/sedonadb/tests/test_sjoin.py
+++ b/python/sedonadb/tests/test_sjoin.py
@@ -104,7 +104,7 @@ def test_spatial_join_geography(join_type, on):
             "vertices_per_linestring_range": [2, 10],
             "bounds": west_most_bound,
             "size_range": [0.1, 5],
-            "seed": 42,
+            "seed": 43,
         }
     )
     df_point = eng_sedonadb.execute_and_collect(
@@ -118,7 +118,7 @@ def test_spatial_join_geography(join_type, on):
             "vertices_per_linestring_range": [2, 10],
             "bounds": east_most_bound,
             "size_range": [0.1, 5],
-            "seed": 43,
+            "seed": 44,
         }
     )
     df_polygon = eng_sedonadb.execute_and_collect(
diff --git a/rust/sedona-testing/src/datagen.rs 
b/rust/sedona-testing/src/datagen.rs
index 76fc2b5..4fb73c2 100644
--- a/rust/sedona-testing/src/datagen.rs
+++ b/rust/sedona-testing/src/datagen.rs
@@ -566,7 +566,8 @@ fn generate_random_linestring<R: rand::Rng>(
         );
         // Always sample in such a way that we end up with a valid linestring
         let num_vertices = rng.sample(vertices_dist).max(2);
-        let coords = generate_circular_vertices(center_x, center_y, half_size, 
num_vertices, false);
+        let coords =
+            generate_circular_vertices(rng, center_x, center_y, half_size, 
num_vertices, false);
         LineString::from(coords)
     }
 }
@@ -582,7 +583,8 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, 
options: &RandomGeometryOp
         );
         // Always sample in such a way that we end up with a valid Polygon
         let num_vertices = rng.sample(vertices_dist).max(3);
-        let coords = generate_circular_vertices(center_x, center_y, half_size, 
num_vertices, true);
+        let coords =
+            generate_circular_vertices(rng, center_x, center_y, half_size, 
num_vertices, true);
         let shell = LineString::from(coords);
         let mut holes = Vec::new();
 
@@ -593,7 +595,7 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, 
options: &RandomGeometryOp
         if add_hole {
             let new_size = half_size * hole_scale_factor;
             let mut coords =
-                generate_circular_vertices(center_x, center_y, new_size, 
num_vertices, true);
+                generate_circular_vertices(rng, center_x, center_y, new_size, 
num_vertices, true);
             coords.reverse();
             holes.push(LineString::from(coords));
         }
@@ -756,7 +758,8 @@ fn generate_non_overlapping_sub_rectangles(num_parts: 
usize, bounds: &Rect) -> V
     tiles
 }
 
-fn generate_circular_vertices(
+fn generate_circular_vertices<R: rand::Rng>(
+    rng: &mut R,
     center_x: f64,
     center_y: f64,
     radius: f64,
@@ -764,7 +767,11 @@ fn generate_circular_vertices(
     closed: bool,
 ) -> Vec<Coord> {
     let mut out = Vec::new();
-    let mut angle: f64 = 0.0;
+
+    // Randomize starting angle (0 to 2 * PI)
+    let start_angle_dist = Uniform::new(0.0, 2.0 * PI);
+    let mut angle: f64 = rng.sample(start_angle_dist);
+
     let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
     for _ in 0..num_vertices {
         out.push(Coord {

Reply via email to