Re: [PR] feat: Add comprehensive KNN join integration tests and benchmarks [sedona-db]

via GitHub Thu, 11 Sep 2025 13:52:52 -0700


Copilot commented on code in PR #65:
URL: https://github.com/apache/sedona-db/pull/65#discussion_r2342315614



##########
benchmarks/test_knn.py:
##########
@@ -89,138 +96,162 @@ def test_knn_performance(self, benchmark, k, 
use_spheroid, dataset_size):
         else:
             trip_table = "knn_trips_small"
             building_table = "knn_buildings"
-            trip_limit = 500
+            trip_limit = 1000
 
-        spheroid_str = "TRUE" if use_spheroid else "FALSE"
+        # Get the appropriate engine instance
+        eng = self._get_eng(engine)

Review Comment:
   The method `_get_eng` is not defined in this class. This will cause a 
runtime error when the benchmark tries to execute.



##########
python/sedonadb/tests/test_knnjoin.py:
##########
@@ -0,0 +1,439 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import json
+from sedonadb.testing import PostGIS, SedonaDB
+
+
[email protected]("k", [1, 3, 5])
+def test_knn_join_basic(k):
+    """Test basic KNN join functionality with synthetic data"""
+    eng_sedonadb = SedonaDB.create_or_skip()
+    eng_postgis = PostGIS.create_or_skip()
+
+    # Create query points (probe side)
+    point_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 20,
+            "seed": 42,
+        }
+    )
+    df_points = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 20"
+    )
+
+    # Create target points (build side)
+    target_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 50,
+            "seed": 43,
+        }
+    )
+    df_targets = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{target_options}') LIMIT 50"
+    )
+
+    # Set up tables in both engines
+    eng_sedonadb.create_table_arrow("knn_query_points", df_points)
+    eng_sedonadb.create_table_arrow("knn_target_points", df_targets)
+    eng_postgis.create_table_arrow("knn_query_points", df_points)
+    eng_postgis.create_table_arrow("knn_target_points", df_targets)
+
+    # SedonaDB syntax using ST_KNN
+    sedonadb_sql = f"""
+        SELECT
+            q.id as query_id,
+            t.id as target_id,
+            ST_Distance(q.geometry, t.geometry) as distance
+        FROM knn_query_points q
+        JOIN knn_target_points t ON ST_KNN(q.geometry, t.geometry, {k}, FALSE)
+        ORDER BY query_id, distance
+    """
+
+    sedonadb_results = 
eng_sedonadb.execute_and_collect(sedonadb_sql).to_pandas()
+
+    # Verify basic correctness
+    assert len(sedonadb_results) > 0
+    assert (
+        len(sedonadb_results) == len(df_points) * k
+    )  # Each query point should have k neighbors
+
+    # Verify results are ordered by distance within each query point
+    for query_id in sedonadb_results["query_id"].unique():
+        query_results = sedonadb_results[sedonadb_results["query_id"] == 
query_id]
+        distances = query_results["distance"].tolist()
+        assert distances == sorted(distances), (
+            f"Distances not sorted for query_id {query_id}: {distances}"
+        )
+
+    # PostGIS syntax using distance operator and window functions for KNN
+    postgis_sql = f"""
+        WITH ranked_neighbors AS (
+            SELECT
+                q.id as query_id,
+                t.id as target_id,
+                ST_Distance(q.geometry, t.geometry) as distance,
+                ROW_NUMBER() OVER (PARTITION BY q.id ORDER BY q.geometry <-> 
t.geometry) as rn
+            FROM knn_query_points q
+            CROSS JOIN knn_target_points t
+        )
+        SELECT query_id, target_id, distance
+        FROM ranked_neighbors
+        WHERE rn <= {k}
+        ORDER BY query_id, distance
+    """
+
+    # Compare with PostGIS (if available)
+    eng_postgis.assert_query_result(postgis_sql, sedonadb_results)
+
+
+def test_knn_join_with_polygons():
+    """Test KNN join between points and polygons"""
+    eng_sedonadb = SedonaDB.create_or_skip()
+    eng_postgis = PostGIS.create_or_skip()
+
+    # Create query points
+    point_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 15,
+            "seed": 100,
+        }
+    )
+    df_points = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 20"

Review Comment:
   The LIMIT clause conflicts with the target_rows parameter. The point_options 
specifies target_rows=15 but the query limits to 20 rows, which may cause 
unexpected behavior or inconsistent test data.
   ```suggestion
           f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 15"
   ```



##########
python/sedonadb/tests/test_knnjoin.py:
##########
@@ -0,0 +1,439 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import json
+from sedonadb.testing import PostGIS, SedonaDB
+
+
[email protected]("k", [1, 3, 5])
+def test_knn_join_basic(k):
+    """Test basic KNN join functionality with synthetic data"""
+    eng_sedonadb = SedonaDB.create_or_skip()
+    eng_postgis = PostGIS.create_or_skip()
+
+    # Create query points (probe side)
+    point_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 20,
+            "seed": 42,
+        }
+    )
+    df_points = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 20"
+    )
+
+    # Create target points (build side)
+    target_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 50,
+            "seed": 43,
+        }
+    )
+    df_targets = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{target_options}') LIMIT 50"
+    )
+
+    # Set up tables in both engines
+    eng_sedonadb.create_table_arrow("knn_query_points", df_points)
+    eng_sedonadb.create_table_arrow("knn_target_points", df_targets)
+    eng_postgis.create_table_arrow("knn_query_points", df_points)
+    eng_postgis.create_table_arrow("knn_target_points", df_targets)
+
+    # SedonaDB syntax using ST_KNN
+    sedonadb_sql = f"""
+        SELECT
+            q.id as query_id,
+            t.id as target_id,
+            ST_Distance(q.geometry, t.geometry) as distance
+        FROM knn_query_points q
+        JOIN knn_target_points t ON ST_KNN(q.geometry, t.geometry, {k}, FALSE)
+        ORDER BY query_id, distance
+    """
+
+    sedonadb_results = 
eng_sedonadb.execute_and_collect(sedonadb_sql).to_pandas()
+
+    # Verify basic correctness
+    assert len(sedonadb_results) > 0
+    assert (
+        len(sedonadb_results) == len(df_points) * k
+    )  # Each query point should have k neighbors
+
+    # Verify results are ordered by distance within each query point
+    for query_id in sedonadb_results["query_id"].unique():
+        query_results = sedonadb_results[sedonadb_results["query_id"] == 
query_id]
+        distances = query_results["distance"].tolist()
+        assert distances == sorted(distances), (
+            f"Distances not sorted for query_id {query_id}: {distances}"
+        )
+
+    # PostGIS syntax using distance operator and window functions for KNN
+    postgis_sql = f"""
+        WITH ranked_neighbors AS (
+            SELECT
+                q.id as query_id,
+                t.id as target_id,
+                ST_Distance(q.geometry, t.geometry) as distance,
+                ROW_NUMBER() OVER (PARTITION BY q.id ORDER BY q.geometry <-> 
t.geometry) as rn
+            FROM knn_query_points q
+            CROSS JOIN knn_target_points t
+        )
+        SELECT query_id, target_id, distance
+        FROM ranked_neighbors
+        WHERE rn <= {k}
+        ORDER BY query_id, distance
+    """
+
+    # Compare with PostGIS (if available)
+    eng_postgis.assert_query_result(postgis_sql, sedonadb_results)
+
+
+def test_knn_join_with_polygons():
+    """Test KNN join between points and polygons"""
+    eng_sedonadb = SedonaDB.create_or_skip()
+    eng_postgis = PostGIS.create_or_skip()
+
+    # Create query points
+    point_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 15,
+            "seed": 100,
+        }
+    )
+    df_points = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 20"
+    )
+
+    # Create target polygons
+    polygon_options = json.dumps(
+        {
+            "geom_type": "Polygon",
+            "target_rows": 30,
+            "vertices_per_linestring_range": [4, 8],
+            "size_range": [0.001, 0.01],
+            "seed": 101,
+        }
+    )
+    df_polygons = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{polygon_options}') LIMIT 30"
+    )
+
+    # Set up tables
+    eng_sedonadb.create_table_arrow("knn_points", df_points)
+    eng_sedonadb.create_table_arrow("knn_polygons", df_polygons)
+    eng_postgis.create_table_arrow("knn_points", df_points)
+    eng_postgis.create_table_arrow("knn_polygons", df_polygons)
+
+    k = 3
+    # SedonaDB syntax
+    sedonadb_sql = f"""
+        SELECT
+            p.id as point_id,
+            pol.id as polygon_id,
+            ST_Distance(p.geometry, pol.geometry) as distance
+        FROM knn_points p
+        JOIN knn_polygons pol ON ST_KNN(p.geometry, pol.geometry, {k}, FALSE)
+        ORDER BY point_id, distance
+    """
+
+    sedonadb_results = 
eng_sedonadb.execute_and_collect(sedonadb_sql).to_pandas()
+
+    # Verify correctness
+    assert len(sedonadb_results) > 0
+    assert len(sedonadb_results) == len(df_points) * k
+
+    # Verify ordering within each point
+    for point_id in sedonadb_results["point_id"].unique():
+        point_results = sedonadb_results[sedonadb_results["point_id"] == 
point_id]
+        distances = point_results["distance"].tolist()
+        assert distances == sorted(distances), (
+            f"Distances not sorted for point_id {point_id}"
+        )
+
+    # PostGIS syntax
+    postgis_sql = f"""
+        WITH ranked_neighbors AS (
+            SELECT
+                p.id as point_id,
+                pol.id as polygon_id,
+                ST_Distance(p.geometry, pol.geometry) as distance,
+                ROW_NUMBER() OVER (PARTITION BY p.id ORDER BY p.geometry <-> 
pol.geometry) as rn
+            FROM knn_points p
+            CROSS JOIN knn_polygons pol
+        )
+        SELECT point_id, polygon_id, distance
+        FROM ranked_neighbors
+        WHERE rn <= {k}
+        ORDER BY point_id, distance
+    """
+
+    eng_postgis.assert_query_result(postgis_sql, sedonadb_results)
+
+
+def test_knn_join_edge_cases():
+    """Test KNN join edge cases"""
+    eng_sedonadb = SedonaDB.create_or_skip()
+    eng_postgis = PostGIS.create_or_skip()
+
+    # Create small datasets for edge case testing
+    point_options = json.dumps(
+        {
+            "geom_type": "Point",
+            "target_rows": 5,
+            "seed": 200,
+        }
+    )
+    df_points = eng_sedonadb.execute_and_collect(
+        f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 20"

Review Comment:
   The LIMIT clause conflicts with the target_rows parameter. The point_options 
specifies target_rows=5 but the query limits to 20 rows, which may cause 
unexpected behavior or inconsistent test data.
   ```suggestion
           f"SELECT * FROM sd_random_geometry('{point_options}') LIMIT 5"
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: Add comprehensive KNN join integration tests and benchmarks [sedona-db]

Reply via email to