This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c954c56cb [GH-2215] Retain sindex, make sindex.query return geometries, implement has_sindex (#2216)
3c954c56cb is described below

commit 3c954c56cb333614c2b2da68e99bd65e61afeddb
Author: Peter Nguyen <[email protected]>
AuthorDate: Sun Aug 3 00:15:43 2025 -0700

    [GH-2215] Retain sindex, make sindex.query return geometries, implement has_sindex (#2216)
    
    * Retain sindex, make sindex.query return geometries, implement has_sindex
    
    * empty commit
    
    * Update python/sedona/geopandas/base.py
    
    Co-authored-by: Copilot <[email protected]>
    
    ---------
    
    Co-authored-by: Copilot <[email protected]>
---
 python/sedona/geopandas/base.py          | 78 ++++++++++++++++++++++----------
 python/sedona/geopandas/geodataframe.py  | 21 +--------
 python/sedona/geopandas/geoseries.py     | 36 ++++++---------
 python/sedona/geopandas/sindex.py        |  6 ++-
 python/tests/geopandas/test_geoseries.py | 42 +++++++++++++++++
 5 files changed, 115 insertions(+), 68 deletions(-)

diff --git a/python/sedona/geopandas/base.py b/python/sedona/geopandas/base.py
index d7837bec8e..6098c7f245 100644
--- a/python/sedona/geopandas/base.py
+++ b/python/sedona/geopandas/base.py
@@ -47,41 +47,69 @@ class GeoFrame(metaclass=ABCMeta):
     A base class for both GeoDataFrame and GeoSeries.
     """
 
-    # def _reduce_for_geostat_function(
-    #     self,
-    #     sfun: Callable[["GeoSeries"], Column],
-    #     name: str,
-    #     axis: Optional[Axis] = None,
-    #     numeric_only: bool = True,
-    #     skipna: bool = True,
-    #     **kwargs: Any,
-    # ) -> Union["GeoSeries", Scalar]:
-    #     raise NotImplementedError("This method is not implemented yet.")
-
     @property
-    @abstractmethod
     def sindex(self) -> "SpatialIndex":
         """
-        Returns a spatial index built from the geometries.
+        Returns a spatial index for the GeoSeries.
+
+        Note that the spatial index may not be fully
+        initialized until the first use.
+
+        Currently, sindex is not retained when calling this method from a GeoDataFrame.
+        You can workaround this by first extracting the active geometry column as a GeoSeries,
+        and calling this method.
 
         Returns
         -------
         SpatialIndex
-            The spatial index for this GeoDataFrame.
+            The spatial index.
 
         Examples
         --------
-        >>> from shapely.geometry import Point
-        >>> from sedona.geopandas import GeoDataFrame
+        >>> from shapely.geometry import Point, box
+        >>> from sedona.geopandas import GeoSeries
         >>>
-        >>> gdf = GeoDataFrame([{"geometry": Point(1, 1), "value": 1},
-        ...                     {"geometry": Point(2, 2), "value": 2}])
-        >>> index = gdf.sindex
-        >>> index.size
-        2
+        >>> s = GeoSeries([Point(x, x) for x in range(5)])
+        >>> s.sindex.query(box(1, 1, 3, 3))
+        [Point(1, 1), Point(2, 2), Point(3, 3)]
+        >>> s.has_sindex
+        True
+        """
+        return _delegate_to_geometry_column("sindex", self)
+
+    @property
+    def has_sindex(self):
+        """Check the existence of the spatial index without generating it.
+
+        Use the `.sindex` attribute on a GeoDataFrame or GeoSeries
+        to generate a spatial index if it does not yet exist,
+        which may take considerable time based on the underlying index
+        implementation.
+
+        Note that the underlying spatial index may not be fully
+        initialized until the first use.
+
+        Currently, sindex is not retained when calling this method from a GeoDataFrame.
+        You can workaround this by first extracting the active geometry column as a GeoSeries,
+        and calling this method.
+
+        Examples
+        --------
+        >>> from shapely.geometry import Point
+        >>> s = GeoSeries([Point(x, x) for x in range(5)])
+        >>> s.has_sindex
+        False
+        >>> index = s.sindex
+        >>> s.has_sindex
+        True
+
+        Returns
+        -------
+        bool
+            `True` if the spatial index has been generated or
+            `False` if not.
         """
-        # We pass in self.geometry here to use the active geometry column for dataframe
-        return _delegate_to_geometry_column("sindex", self.geometry)
+        return _delegate_to_geometry_column("has_sindex", self)
 
     @abstractmethod
     def copy(self: GeoFrameLike) -> GeoFrameLike:
@@ -2300,6 +2328,7 @@ class GeoFrame(metaclass=ABCMeta):
 
 def _delegate_to_geometry_column(op, this, *args, **kwargs):
     geom_column = this.geometry
+    inplace = kwargs.pop("inplace", False)
     if args or kwargs:
         data = getattr(geom_column, op)(*args, **kwargs)
     else:
@@ -2308,7 +2337,8 @@ def _delegate_to_geometry_column(op, this, *args, **kwargs):
         if callable(data):
             data = data()
 
-    if kwargs.get("inplace", False):
+    if inplace:
+        # This assumes this is a GeoSeries
         this._update_inplace(geom_column)
         return None
 
diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
index fa4b1bc825..2d35b78dde 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -29,20 +29,15 @@ import pandas as pd
 import pyspark.pandas as pspd
 import sedona.geopandas as sgpd
 from pyspark.pandas import Series as PandasOnSparkSeries
-from pyspark.pandas._typing import Dtype
 from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
-from pyspark.pandas.internal import InternalFrame
 from pyspark.pandas.utils import log_advice
 
 from sedona.geopandas._typing import Label
 from sedona.geopandas.base import GeoFrame
-from sedona.geopandas.sindex import SpatialIndex
 
 from pandas.api.extensions import register_extension_dtype
 from geopandas.geodataframe import crs_mismatch_error
 from geopandas.array import GeometryDtype
-from shapely.geometry.base import BaseGeometry
-from pyspark.pandas.internal import SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME
 
 register_extension_dtype(GeometryDtype)
 
@@ -666,6 +661,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
 
         frame._geometry_column_name = geo_column_name
         if new_series:
+            # Note: This casts GeoSeries back into pspd.Series, so we lose any metadata that's not serialized
             frame[geo_column_name] = level
 
         if not inplace:
@@ -794,21 +790,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         """
         return pspd.DataFrame(self._internal)
 
-    @property
-    def sindex(self) -> SpatialIndex | None:
-        """
-        Returns a spatial index for the GeoDataFrame.
-        The spatial index allows for efficient spatial queries. If the spatial
-        index cannot be created (e.g., no geometry column is present), this
-        property will return None.
-        Returns:
-        - SpatialIndex: The spatial index for the GeoDataFrame.
-        - None: If the spatial index is not supported.
-        """
-        if "geometry" in self.columns:
-            return SpatialIndex(self._internal.spark_frame, column_name="geometry")
-        return None
-
     def copy(self, deep=False):
         """
         Make a copy of this GeoDataFrame object.
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 5f2b8dcb19..855f5e60bb 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -352,6 +352,7 @@ class GeoSeries(GeoFrame, pspd.Series):
 
         self._anchor: GeoDataFrame
         self._col_label: Label
+        self._sindex: SpatialIndex = None
 
         if isinstance(
             data, (GeoDataFrame, GeoSeries, PandasOnSparkSeries, PandasOnSparkDataFrame)
@@ -626,7 +627,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         result = self._query_geometry_column(spark_col, keep_name=True)
 
         if inplace:
-            self._update_inplace(result)
+            self._update_inplace(result, invalidate_sindex=False)
             return None
 
         return result
@@ -750,29 +751,18 @@ class GeoSeries(GeoFrame, pspd.Series):
 
     @property
     def sindex(self) -> SpatialIndex:
-        """
-        Returns a spatial index built from the geometries.
-
-        Returns
-        -------
-        SpatialIndex
-            The spatial index for this GeoDataFrame.
-
-        Examples
-        --------
-        >>> from shapely.geometry import Point
-        >>> from sedona.geopandas import GeoDataFrame
-        >>>
-        >>> gdf = GeoDataFrame([{"geometry": Point(1, 1), "value": 1},
-        ...                     {"geometry": Point(2, 2), "value": 2}])
-        >>> index = gdf.sindex
-        >>> index.size
-        2
-        """
         geometry_column = _get_series_col_name(self)
         if geometry_column is None:
             raise ValueError("No geometry column found in GeoSeries")
-        return SpatialIndex(self._internal.spark_frame, column_name=geometry_column)
+        if self._sindex is None:
+            self._sindex = SpatialIndex(
+                self._internal.spark_frame, column_name=geometry_column
+            )
+        return self._sindex
+
+    @property
+    def has_sindex(self):
+        return self._sindex is not None
 
     def copy(self, deep=False):
         """Make a copy of this GeoSeries object.
@@ -2917,9 +2907,11 @@ e": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [3
     # # Utils
     # -----------------------------------------------------------------------------
 
-    def _update_inplace(self, result: "GeoSeries"):
+    def _update_inplace(self, result: "GeoSeries", invalidate_sindex: bool = True):
         self.rename(result.name, inplace=True)
         self._update_anchor(result._anchor)
+        if invalidate_sindex:
+            self._sindex = None
 
     def _make_series_of_val(self, value: Any):
         """
diff --git a/python/sedona/geopandas/sindex.py b/python/sedona/geopandas/sindex.py
index 638344ac36..8e5e515670 100644
--- a/python/sedona/geopandas/sindex.py
+++ b/python/sedona/geopandas/sindex.py
@@ -117,8 +117,10 @@ class SpatialIndex:
                     self._indexed_rdd, geometry, True, True
                 )
 
-            results = result_rdd.collect()
-            return results
+            geo_data_list = result_rdd.collect()
+            # No need to keep the userData field, so convert it directly to a list of geometries
+            geoms_list = [row.geom for row in geo_data_list]
+            return geoms_list
         else:
             # For local spatial index based on Shapely STRtree
             if predicate == "contains":
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 5f9b439874..625df0e8e5 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -113,6 +113,48 @@ class TestGeoSeries(TestGeopandasBase):
         ps_df = ps.Series(data, index=index)
         assert_series_equal(result.to_pandas(), ps_df.to_pandas())
 
+    def test_sindex(self):
+        s = GeoSeries([Point(x, x) for x in range(5)])
+        assert not s.has_sindex
+
+        result = s.sindex.query(box(1, 1, 3, 3))
+        expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+        assert result == expected
+        assert s.has_sindex
+
+        result = s.sindex.query(box(1, 1, 3, 3), predicate="contains")
+        expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+        assert result == expected
+        assert s.has_sindex
+
+        # Check that it works with a GeoDataFrame
+        gdf = s.to_geoframe()
+        result = gdf.sindex.query(box(1, 1, 3, 3), predicate="contains")
+        assert result == expected
+
+        # This is challenging to support due to gdf.__setitem__ casting GeoSeries into pspd.Series
+        # assert gdf.has_sindex
+
+    def test_invalidate_sindex(self):
+        geoseries = GeoSeries([Point(0, 0), None, Point(2, 2)])
+
+        line = LineString([(1, 1), (3, 3)])
+        result1 = geoseries.sindex.query(line)
+        assert len(result1) == 1
+        assert geoseries.has_sindex
+
+        # Fill the None element with a new geometry that intersects with the line
+        # This should invalidate the sindex
+        geoseries.fillna(Point(1, 1), inplace=True)
+        assert not geoseries.has_sindex
+
+        result = geoseries.sindex.query(line)
+        assert len(result) == 2
+
+        # For set_crs, no need to invalidate the sindex
+        geoseries.set_crs(4326, inplace=True)
+        assert geoseries.has_sindex
+
     def test_plot(self):
         # Just make sure it doesn't error
         self.geoseries.plot()

Reply via email to