This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 3c954c56cb [GH-2215] Retain sindex, make sindex.query return geometries, implement has_sindex (#2216)
3c954c56cb is described below
commit 3c954c56cb333614c2b2da68e99bd65e61afeddb
Author: Peter Nguyen <[email protected]>
AuthorDate: Sun Aug 3 00:15:43 2025 -0700
[GH-2215] Retain sindex, make sindex.query return geometries, implement has_sindex (#2216)
* Retain sindex, make sindex.query return geometries, implement has_sindex
* empty commit
* Update python/sedona/geopandas/base.py
Co-authored-by: Copilot <[email protected]>
---------
Co-authored-by: Copilot <[email protected]>
---
python/sedona/geopandas/base.py | 78 ++++++++++++++++++++++----------
python/sedona/geopandas/geodataframe.py | 21 +--------
python/sedona/geopandas/geoseries.py | 36 ++++++---------
python/sedona/geopandas/sindex.py | 6 ++-
python/tests/geopandas/test_geoseries.py | 42 +++++++++++++++++
5 files changed, 115 insertions(+), 68 deletions(-)
diff --git a/python/sedona/geopandas/base.py b/python/sedona/geopandas/base.py
index d7837bec8e..6098c7f245 100644
--- a/python/sedona/geopandas/base.py
+++ b/python/sedona/geopandas/base.py
@@ -47,41 +47,69 @@ class GeoFrame(metaclass=ABCMeta):
A base class for both GeoDataFrame and GeoSeries.
"""
- # def _reduce_for_geostat_function(
- # self,
- # sfun: Callable[["GeoSeries"], Column],
- # name: str,
- # axis: Optional[Axis] = None,
- # numeric_only: bool = True,
- # skipna: bool = True,
- # **kwargs: Any,
- # ) -> Union["GeoSeries", Scalar]:
- # raise NotImplementedError("This method is not implemented yet.")
-
@property
- @abstractmethod
def sindex(self) -> "SpatialIndex":
"""
- Returns a spatial index built from the geometries.
+ Returns a spatial index for the GeoSeries.
+
+ Note that the spatial index may not be fully
+ initialized until the first use.
+
+ Currently, sindex is not retained when calling this method from a GeoDataFrame.
+ You can workaround this by first extracting the active geometry column as a GeoSeries,
+ and calling this method.
Returns
-------
SpatialIndex
- The spatial index for this GeoDataFrame.
+ The spatial index.
Examples
--------
- >>> from shapely.geometry import Point
- >>> from sedona.geopandas import GeoDataFrame
+ >>> from shapely.geometry import Point, box
+ >>> from sedona.geopandas import GeoSeries
>>>
- >>> gdf = GeoDataFrame([{"geometry": Point(1, 1), "value": 1},
- ... {"geometry": Point(2, 2), "value": 2}])
- >>> index = gdf.sindex
- >>> index.size
- 2
+ >>> s = GeoSeries([Point(x, x) for x in range(5)])
+ >>> s.sindex.query(box(1, 1, 3, 3))
+ [Point(1, 1), Point(2, 2), Point(3, 3)]
+ >>> s.has_sindex
+ True
+ """
+ return _delegate_to_geometry_column("sindex", self)
+
+ @property
+ def has_sindex(self):
+ """Check the existence of the spatial index without generating it.
+
+ Use the `.sindex` attribute on a GeoDataFrame or GeoSeries
+ to generate a spatial index if it does not yet exist,
+ which may take considerable time based on the underlying index
+ implementation.
+
+ Note that the underlying spatial index may not be fully
+ initialized until the first use.
+
+ Currently, sindex is not retained when calling this method from a GeoDataFrame.
+ You can workaround this by first extracting the active geometry column as a GeoSeries,
+ and calling this method.
+
+ Examples
+ --------
+ >>> from shapely.geometry import Point
+ >>> s = GeoSeries([Point(x, x) for x in range(5)])
+ >>> s.has_sindex
+ False
+ >>> index = s.sindex
+ >>> s.has_sindex
+ True
+
+ Returns
+ -------
+ bool
+ `True` if the spatial index has been generated or
+ `False` if not.
"""
- # We pass in self.geometry here to use the active geometry column for dataframe
- return _delegate_to_geometry_column("sindex", self.geometry)
+ return _delegate_to_geometry_column("has_sindex", self)
@abstractmethod
def copy(self: GeoFrameLike) -> GeoFrameLike:
@@ -2300,6 +2328,7 @@ class GeoFrame(metaclass=ABCMeta):
def _delegate_to_geometry_column(op, this, *args, **kwargs):
geom_column = this.geometry
+ inplace = kwargs.pop("inplace", False)
if args or kwargs:
data = getattr(geom_column, op)(*args, **kwargs)
else:
@@ -2308,7 +2337,8 @@ def _delegate_to_geometry_column(op, this, *args, **kwargs):
if callable(data):
data = data()
- if kwargs.get("inplace", False):
+ if inplace:
+ # This assumes this is a GeoSeries
this._update_inplace(geom_column)
return None
diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
index fa4b1bc825..2d35b78dde 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -29,20 +29,15 @@ import pandas as pd
import pyspark.pandas as pspd
import sedona.geopandas as sgpd
from pyspark.pandas import Series as PandasOnSparkSeries
-from pyspark.pandas._typing import Dtype
from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
-from pyspark.pandas.internal import InternalFrame
from pyspark.pandas.utils import log_advice
from sedona.geopandas._typing import Label
from sedona.geopandas.base import GeoFrame
-from sedona.geopandas.sindex import SpatialIndex
from pandas.api.extensions import register_extension_dtype
from geopandas.geodataframe import crs_mismatch_error
from geopandas.array import GeometryDtype
-from shapely.geometry.base import BaseGeometry
-from pyspark.pandas.internal import SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME
register_extension_dtype(GeometryDtype)
@@ -666,6 +661,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
frame._geometry_column_name = geo_column_name
if new_series:
+ # Note: This casts GeoSeries back into pspd.Series, so we lose any metadata that's not serialized
frame[geo_column_name] = level
if not inplace:
@@ -794,21 +790,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
"""
return pspd.DataFrame(self._internal)
- @property
- def sindex(self) -> SpatialIndex | None:
- """
- Returns a spatial index for the GeoDataFrame.
- The spatial index allows for efficient spatial queries. If the spatial
- index cannot be created (e.g., no geometry column is present), this
- property will return None.
- Returns:
- - SpatialIndex: The spatial index for the GeoDataFrame.
- - None: If the spatial index is not supported.
- """
- if "geometry" in self.columns:
- return SpatialIndex(self._internal.spark_frame, column_name="geometry")
- return None
-
def copy(self, deep=False):
"""
Make a copy of this GeoDataFrame object.
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 5f2b8dcb19..855f5e60bb 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -352,6 +352,7 @@ class GeoSeries(GeoFrame, pspd.Series):
self._anchor: GeoDataFrame
self._col_label: Label
+ self._sindex: SpatialIndex = None
if isinstance(
data, (GeoDataFrame, GeoSeries, PandasOnSparkSeries,
PandasOnSparkDataFrame)
@@ -626,7 +627,7 @@ class GeoSeries(GeoFrame, pspd.Series):
result = self._query_geometry_column(spark_col, keep_name=True)
if inplace:
- self._update_inplace(result)
+ self._update_inplace(result, invalidate_sindex=False)
return None
return result
@@ -750,29 +751,18 @@ class GeoSeries(GeoFrame, pspd.Series):
@property
def sindex(self) -> SpatialIndex:
- """
- Returns a spatial index built from the geometries.
-
- Returns
- -------
- SpatialIndex
- The spatial index for this GeoDataFrame.
-
- Examples
- --------
- >>> from shapely.geometry import Point
- >>> from sedona.geopandas import GeoDataFrame
- >>>
- >>> gdf = GeoDataFrame([{"geometry": Point(1, 1), "value": 1},
- ... {"geometry": Point(2, 2), "value": 2}])
- >>> index = gdf.sindex
- >>> index.size
- 2
- """
geometry_column = _get_series_col_name(self)
if geometry_column is None:
raise ValueError("No geometry column found in GeoSeries")
- return SpatialIndex(self._internal.spark_frame, column_name=geometry_column)
+ if self._sindex is None:
+ self._sindex = SpatialIndex(
+ self._internal.spark_frame, column_name=geometry_column
+ )
+ return self._sindex
+
+ @property
+ def has_sindex(self):
+ return self._sindex is not None
def copy(self, deep=False):
"""Make a copy of this GeoSeries object.
@@ -2917,9 +2907,11 @@ e": "Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [3
# # Utils
# -----------------------------------------------------------------------------
- def _update_inplace(self, result: "GeoSeries"):
+ def _update_inplace(self, result: "GeoSeries", invalidate_sindex: bool = True):
self.rename(result.name, inplace=True)
self._update_anchor(result._anchor)
+ if invalidate_sindex:
+ self._sindex = None
def _make_series_of_val(self, value: Any):
"""
diff --git a/python/sedona/geopandas/sindex.py b/python/sedona/geopandas/sindex.py
index 638344ac36..8e5e515670 100644
--- a/python/sedona/geopandas/sindex.py
+++ b/python/sedona/geopandas/sindex.py
@@ -117,8 +117,10 @@ class SpatialIndex:
self._indexed_rdd, geometry, True, True
)
- results = result_rdd.collect()
- return results
+ geo_data_list = result_rdd.collect()
+ # No need to keep the userData field, so convert it directly to a list of geometries
+ geoms_list = [row.geom for row in geo_data_list]
+ return geoms_list
else:
# For local spatial index based on Shapely STRtree
if predicate == "contains":
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 5f9b439874..625df0e8e5 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -113,6 +113,48 @@ class TestGeoSeries(TestGeopandasBase):
ps_df = ps.Series(data, index=index)
assert_series_equal(result.to_pandas(), ps_df.to_pandas())
+ def test_sindex(self):
+ s = GeoSeries([Point(x, x) for x in range(5)])
+ assert not s.has_sindex
+
+ result = s.sindex.query(box(1, 1, 3, 3))
+ expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+ assert result == expected
+ assert s.has_sindex
+
+ result = s.sindex.query(box(1, 1, 3, 3), predicate="contains")
+ expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+ assert result == expected
+ assert s.has_sindex
+
+ # Check that it works with a GeoDataFrame
+ gdf = s.to_geoframe()
+ result = gdf.sindex.query(box(1, 1, 3, 3), predicate="contains")
+ assert result == expected
+
+ # This is challenging to support due to gdf.__setitem__ casting GeoSeries into pspd.Series
+ # assert gdf.has_sindex
+
+ def test_invalidate_sindex(self):
+ geoseries = GeoSeries([Point(0, 0), None, Point(2, 2)])
+
+ line = LineString([(1, 1), (3, 3)])
+ result1 = geoseries.sindex.query(line)
+ assert len(result1) == 1
+ assert geoseries.has_sindex
+
+ # Fill the None element with a new geometry that intersects with the line
+ # This should invalidate the sindex
+ geoseries.fillna(Point(1, 1), inplace=True)
+ assert not geoseries.has_sindex
+
+ result = geoseries.sindex.query(line)
+ assert len(result) == 2
+
+ # For set_crs, no need to invalidate the sindex
+ geoseries.set_crs(4326, inplace=True)
+ assert geoseries.has_sindex
+
def test_plot(self):
# Just make sure it doesn't error
self.geoseries.plot()