This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 2be2f35 Improve the random benchmark data generator to generate
non-identical pairs of geometries (#70)
2be2f35 is described below
commit 2be2f354d8f3f6a556f426df7e6902e9d3bfc691
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Fri Sep 12 22:25:36 2025 +0800
Improve the random benchmark data generator to generate non-identical pairs
of geometries (#70)
---
benchmarks/test_bench_base.py | 51 +++++++++++++++++++++---
python/sedonadb/tests/functions/test_distance.py | 1 +
python/sedonadb/tests/test_sjoin.py | 4 +-
rust/sedona-testing/src/datagen.rs | 17 +++++---
4 files changed, 60 insertions(+), 13 deletions(-)
diff --git a/benchmarks/test_bench_base.py b/benchmarks/test_bench_base.py
index 8e62f35..2c8061d 100644
--- a/benchmarks/test_bench_base.py
+++ b/benchmarks/test_bench_base.py
@@ -27,13 +27,13 @@ class TestBenchBase:
num_geoms = 100_000
# Setup tables
- for name, options in [
+ for name, base_options in [
(
"segments_large",
{
"geom_type": "LineString",
"target_rows": num_geoms,
- "vertices_per_linestring_range": [2, 2],
+ "vertices_per_linestring_range": [2, 10],
},
),
(
@@ -69,13 +69,52 @@ class TestBenchBase:
},
),
]:
- # Generate synthetic data
+ # Generate synthetic data with two different geometry sets that have overlapping spatial distribution
+ # The intersection rate between geom1 and geom2 will be around 2%.
+ # This creates more realistic workloads for spatial predicates.
+
+ # Options for first geometry set (geom1) - left-leaning distribution
+ options1 = base_options.copy()
+ options1.update(
+ {
+ "seed": 42,
+ "bounds": [0.0, 0.0, 80.0, 100.0], # Slightly left-leaning
+ "size_range": [
+ 1.0,
+ 15.0,
+ ], # Medium-sized geometries for good intersection chance
+ }
+ )
+
+ # Options for second geometry set (geom2) - right-leaning distribution
+ options2 = base_options.copy()
+ options2.update(
+ {
+ "seed": 43,
+ "bounds": [20.0, 0.0, 100.0, 100.0], # Slightly right-leaning
+ "size_range": [1.0, 15.0], # Same size range for fair comparison
+ }
+ )
+
query = f"""
+ WITH geom1_data AS (
+ SELECT
+ geometry as geom1,
+ row_number() OVER () as id
+ FROM sd_random_geometry('{json.dumps(options1)}')
+ ),
+ geom2_data AS (
+ SELECT
+ geometry as geom2,
+ row_number() OVER () as id
+ FROM sd_random_geometry('{json.dumps(options2)}')
+ )
SELECT
- geometry as geom1,
- geometry as geom2,
+ g1.geom1,
+ g2.geom2,
round(random() * 100) as integer
- FROM sd_random_geometry('{json.dumps(options)}')
+ FROM geom1_data g1
+ JOIN geom2_data g2 ON g1.id = g2.id
"""
tab = self.sedonadb.execute_and_collect(query)
diff --git a/python/sedonadb/tests/functions/test_distance.py b/python/sedonadb/tests/functions/test_distance.py
index 8ca589c..bd520f2 100644
--- a/python/sedonadb/tests/functions/test_distance.py
+++ b/python/sedonadb/tests/functions/test_distance.py
@@ -43,4 +43,5 @@ def test_st_distance(eng, geom1, geom2, expected):
eng.assert_query_result(
f"SELECT ST_Distance({geom_or_null(geom1)}, {geom_or_null(geom2)})",
expected,
+ numeric_epsilon=1e-8,
)
diff --git a/python/sedonadb/tests/test_sjoin.py b/python/sedonadb/tests/test_sjoin.py
index 2a3728f..5c29215 100644
--- a/python/sedonadb/tests/test_sjoin.py
+++ b/python/sedonadb/tests/test_sjoin.py
@@ -104,7 +104,7 @@ def test_spatial_join_geography(join_type, on):
"vertices_per_linestring_range": [2, 10],
"bounds": west_most_bound,
"size_range": [0.1, 5],
- "seed": 42,
+ "seed": 43,
}
)
df_point = eng_sedonadb.execute_and_collect(
@@ -118,7 +118,7 @@ def test_spatial_join_geography(join_type, on):
"vertices_per_linestring_range": [2, 10],
"bounds": east_most_bound,
"size_range": [0.1, 5],
- "seed": 43,
+ "seed": 44,
}
)
df_polygon = eng_sedonadb.execute_and_collect(
diff --git a/rust/sedona-testing/src/datagen.rs b/rust/sedona-testing/src/datagen.rs
index 76fc2b5..4fb73c2 100644
--- a/rust/sedona-testing/src/datagen.rs
+++ b/rust/sedona-testing/src/datagen.rs
@@ -566,7 +566,8 @@ fn generate_random_linestring<R: rand::Rng>(
);
// Always sample in such a way that we end up with a valid linestring
let num_vertices = rng.sample(vertices_dist).max(2);
- let coords = generate_circular_vertices(center_x, center_y, half_size, num_vertices, false);
+ let coords =
+ generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, false);
LineString::from(coords)
}
}
@@ -582,7 +583,8 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOp
);
// Always sample in such a way that we end up with a valid Polygon
let num_vertices = rng.sample(vertices_dist).max(3);
- let coords = generate_circular_vertices(center_x, center_y, half_size, num_vertices, true);
+ let coords =
+ generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, true);
let shell = LineString::from(coords);
let mut holes = Vec::new();
@@ -593,7 +595,7 @@ fn generate_random_polygon<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOp
if add_hole {
let new_size = half_size * hole_scale_factor;
let mut coords =
- generate_circular_vertices(center_x, center_y, new_size, num_vertices, true);
+ generate_circular_vertices(rng, center_x, center_y, new_size, num_vertices, true);
coords.reverse();
holes.push(LineString::from(coords));
}
@@ -756,7 +758,8 @@ fn generate_non_overlapping_sub_rectangles(num_parts: usize, bounds: &Rect) -> V
tiles
}
-fn generate_circular_vertices(
+fn generate_circular_vertices<R: rand::Rng>(
+ rng: &mut R,
center_x: f64,
center_y: f64,
radius: f64,
@@ -764,7 +767,11 @@ fn generate_circular_vertices(
closed: bool,
) -> Vec<Coord> {
let mut out = Vec::new();
- let mut angle: f64 = 0.0;
+
+ // Randomize starting angle (0 to 2 * PI)
+ let start_angle_dist = Uniform::new(0.0, 2.0 * PI);
+ let mut angle: f64 = rng.sample(start_angle_dist);
+
let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
for _ in 0..num_vertices {
out.push(Coord {