(sedona) branch master updated: [GH-2331] Geopandas: Document differences of sindex compared to gpd + sindex fixes (#2332)

jiayu Fri, 05 Sep 2025 14:34:19 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new 33c9a3d0cf [GH-2331] Geopandas: Document differences of sindex compared to gpd + sindex fixes (#2332)
33c9a3d0cf is described below

commit 33c9a3d0cf7566981159714db7696cfb56a91b2b
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Sep 5 14:34:05 2025 -0700

    [GH-2331] Geopandas: Document differences of sindex compared to gpd + sindex fixes (#2332)
---
 python/sedona/spark/geopandas/geoseries.py |  4 +-
 python/sedona/spark/geopandas/sindex.py    | 73 ++++++++++++++++++++----------
 python/tests/geopandas/test_sindex.py      | 55 ++++++++++++++++++++--
 3 files changed, 100 insertions(+), 32 deletions(-)

diff --git a/python/sedona/spark/geopandas/geoseries.py b/python/sedona/spark/geopandas/geoseries.py
index 77e6d349c4..b9fb417a8a 100644
--- a/python/sedona/spark/geopandas/geoseries.py
+++ b/python/sedona/spark/geopandas/geoseries.py
@@ -751,9 +751,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         if geometry_column is None:
             raise ValueError("No geometry column found in GeoSeries")
         if self._sindex is None:
-            self._sindex = SpatialIndex(
-                self._internal.spark_frame, column_name=geometry_column
-            )
+            self._sindex = SpatialIndex(self)
         return self._sindex
 
     @property
diff --git a/python/sedona/spark/geopandas/sindex.py b/python/sedona/spark/geopandas/sindex.py
index 1b4829351c..4629289ec9 100644
--- a/python/sedona/spark/geopandas/sindex.py
+++ b/python/sedona/spark/geopandas/sindex.py
@@ -38,12 +38,23 @@ class SpatialIndex:
 
         Parameters
         ----------
-        geometry : np.array of Shapely geometries, PySparkDataFrame column, or PySparkDataFrame
+        geometry : np.array of Shapely geometries, GeoSeries, or PySparkDataFrame
         index_type : str, default "strtree"
             The type of spatial index to use.
         column_name : str, optional
             The column name to extract geometry from if `geometry` is a PySparkDataFrame.
+
+        Note: query methods (ie. query, nearest, intersection) have different behaviors depending on how the index is constructed.
+        When constructed from a np.array, the query methods return indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, the query methods return geometries.
         """
+        from sedona.spark.geopandas import GeoSeries
+
+        if isinstance(geometry, GeoSeries):
+            from sedona.spark.geopandas.geoseries import _get_series_col_name
+
+            column_name = _get_series_col_name(geometry)
+            geometry = geometry._internal.spark_frame
 
         if isinstance(geometry, np.ndarray):
             self.geometry = geometry
@@ -65,7 +76,7 @@ class SpatialIndex:
             self._build_spark_index(column_name)
         else:
             raise TypeError(
-                "Invalid type for `geometry`. Expected np.array or PySparkDataFrame."
+                "Invalid type for `geometry`. Expected np.array, GeoSeries, or PySparkDataFrame."
             )
 
     def query(self, geometry: BaseGeometry, predicate: str = None, sort: bool = False):
@@ -82,12 +93,18 @@ class SpatialIndex:
         sort : bool, optional, default False
             Whether to sort the results.
 
+        Note: query() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
         Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
+        It is recommended to instead use GeoSeries.intersects directly.
 
         Returns
         -------
         list
-            List of indices of matching geometries.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame.
+            List of the corresponding indices if constructed from a np.array.
         """
 
         if not isinstance(geometry, BaseGeometry):
@@ -96,7 +113,7 @@ class SpatialIndex:
             )
 
         log_advice(
-            "`query` returns local list of indices of matching geometries onto driver's memory. "
+            "`query` returns a local list onto driver's memory. "
             "It should only be used if the resulting collection is expected to be small."
         )
 
@@ -170,10 +187,15 @@ class SpatialIndex:
 
         Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
 
+        Note: nearest() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
         Returns
         -------
         list or tuple
-            List of indices of nearest geometries, optionally with distances.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame.
+            List of the corresponding indices if constructed from a np.array.
         """
 
         if not isinstance(geometry, BaseGeometry):
@@ -194,15 +216,18 @@ class SpatialIndex:
             from sedona.spark.core.spatialOperator import KNNQuery
 
             # Execute the KNN query
-            results = KNNQuery.SpatialKnnQuery(self._indexed_rdd, geometry, k, False)
+            geo_data_list = KNNQuery.SpatialKnnQuery(
+                self._indexed_rdd, geometry, k, False
+            )
+
+            # No need to keep the userData field, so convert it directly to a list of geometries
+            geoms_list = [row.geom for row in geo_data_list]
 
             if return_distance:
                 # Calculate distances if requested
-                distances = [
-                    geom.distance(geometry) for geom in [row.geom for row in results]
-                ]
-                return results, distances
-            return results
+                distances = [geom.distance(geometry) for geom in geoms_list]
+                return geoms_list, distances
+            return geoms_list
         else:
             # For local spatial index based on Shapely STRtree
             if k > len(self.geometry):
@@ -220,20 +245,29 @@ class SpatialIndex:
 
     def intersection(self, bounds):
         """
-        Find geometries that intersect the given bounding box.
+        Find geometries that intersect the given bounding box. Similar to the Geopandas version,
+        this is a compatibility wrapper for rtree.index.Index.intersection, use query instead.
 
         Parameters
         ----------
         bounds : tuple
             Bounding box as (min_x, min_y, max_x, max_y).
 
+        Note: intersection() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
+        Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
+        It is recommended to instead use GeoSeries.intersects directly.
+
         Returns
         -------
         list
-            List of indices of matching geometries.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame.
+            List of the corresponding indices if constructed from a np.array.
         """
         log_advice(
-            "`intersection` returns local list of indices of matching geometries onto driver's memory. "
+            "`intersection` returns local list of matching geometries onto driver's memory. "
             "It should only be used if the resulting collection is expected to be small."
         )
 
@@ -246,16 +280,7 @@ class SpatialIndex:
         bbox = box(*bounds)
 
         if self._is_spark:
-            # For Spark-based spatial index
-            from sedona.spark.core.spatialOperator import RangeQuery
-
-            # Execute the spatial range query with the bounding box
-            result_rdd = RangeQuery.SpatialRangeQuery(
-                self._indexed_rdd, bbox, True, True
-            )
-
-            results = result_rdd.collect()
-            return results
+            return self.query(bbox, predicate="intersects")
         else:
             # For local spatial index based on Shapely STRtree
             try:
diff --git a/python/tests/geopandas/test_sindex.py b/python/tests/geopandas/test_sindex.py
index 83c308fdbb..6e68fc40b1 100644
--- a/python/tests/geopandas/test_sindex.py
+++ b/python/tests/geopandas/test_sindex.py
@@ -19,7 +19,7 @@ import pytest
 import numpy as np
 import shapely
 from pyspark.sql.functions import expr
-from shapely.geometry import Point, Polygon, LineString
+from shapely.geometry import Point, Polygon, LineString, box
 
 from tests.test_base import TestBase
 from sedona.spark.geopandas import GeoSeries
@@ -63,6 +63,31 @@ class TestSpatialIndex(TestBase):
             ]
         )
 
+    def test_construct_from_geoseries(self):
+        # Construct from a GeoSeries
+        gs = GeoSeries([Point(x, x) for x in range(5)])
+        sindex = SpatialIndex(gs)
+        result = sindex.query(Point(2, 2))
+        # SpatialIndex constructed from GeoSeries return geometries
+        assert result == [Point(2, 2)]
+
+    def test_construct_from_pyspark_dataframe(self):
+        # Construct from PySparkDataFrame
+        df = self.spark.createDataFrame(
+            [(Point(x, x),) for x in range(5)], ["geometry"]
+        )
+        sindex = SpatialIndex(df, column_name="geometry")
+        result = sindex.query(Point(2, 2))
+        assert result == [Point(2, 2)]
+
+    def test_construct_from_nparray(self):
+        # Construct from np.array
+        array = np.array([Point(x, x) for x in range(5)])
+        sindex = SpatialIndex(array)
+        result = sindex.query(Point(2, 2))
+        # Returns indices like original geopandas
+        assert result == np.array([2])
+
     def test_geoseries_sindex_property_exists(self):
         """Test that the sindex property exists on GeoSeries."""
         assert hasattr(self.points, "sindex")
@@ -182,7 +207,7 @@ class TestSpatialIndex(TestBase):
         assert len(nearest_result) == 1
 
         # The nearest point should have id=2 (POINT(1 1))
-        assert nearest_result[0].geom.wkt == "POINT (1 1)"
+        assert nearest_result[0].wkt == "POINT (1 1)"
 
         # Test finding k=2 nearest neighbors
         nearest_2_results = spark_sindex.nearest(query_point, k=2)
@@ -219,7 +244,7 @@ class TestSpatialIndex(TestBase):
 
         # Should find polygon containing the point
         assert len(nearest_geom) == 1
-        assert "POLYGON" in nearest_geom[0].geom.wkt
+        assert "POLYGON" in nearest_geom[0].wkt
 
         # Test with linestring query
         query_line = LineString([(1.5, 1.5), (2.5, 2.5)])
@@ -343,7 +368,12 @@ class TestSpatialIndex(TestBase):
         result_rows = spark_sindex.intersection(bounds)
 
         # Verify correct results are returned
-        assert len(result_rows) >= 2
+        expected = [
+            Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+            Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+            Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+        ]
+        assert result_rows == expected
 
         # Test with bounds that don't intersect any geometry
         empty_bounds = (10, 10, 11, 11)
@@ -353,7 +383,14 @@ class TestSpatialIndex(TestBase):
         # Test with bounds that cover all geometries
         full_bounds = (-1, -1, 6, 6)
         full_results = spark_sindex.intersection(full_bounds)
-        assert len(full_results) == 5  # Should match all 5 polygons
+        expected = [
+            Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]),
+            Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+            Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+            Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+            Polygon([(4, 4), (5, 4), (5, 5), (4, 5), (4, 4)]),
+        ]
+        assert full_results == expected
 
     def test_intersection_with_points(self):
         """Test the intersection method with point geometries."""
@@ -426,3 +463,11 @@ class TestSpatialIndex(TestBase):
 
         # Verify results
         assert len(results) == 2
+
+    # test from the geopandas docstring
+    def test_geoseries_sindex_intersection(self):
+        gs = GeoSeries([Point(x, x) for x in range(10)])
+        result = gs.sindex.intersection(box(1, 1, 3, 3).bounds)
+        # Unlike original geopandas, this returns geometries instead of indices
+        expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+        assert result == expected

(sedona) branch master updated: [GH-2331] Geopandas: Document differences of sindex compared to gpd + sindex fixes (#2332)

Reply via email to