This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 33c9a3d0cf [GH-2331] Geopandas: Document differences of sindex
compared to gpd + sindex fixes (#2332)
33c9a3d0cf is described below
commit 33c9a3d0cf7566981159714db7696cfb56a91b2b
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Sep 5 14:34:05 2025 -0700
[GH-2331] Geopandas: Document differences of sindex compared to gpd +
sindex fixes (#2332)
---
python/sedona/spark/geopandas/geoseries.py | 4 +-
python/sedona/spark/geopandas/sindex.py | 73 ++++++++++++++++++++----------
python/tests/geopandas/test_sindex.py | 55 ++++++++++++++++++++--
3 files changed, 100 insertions(+), 32 deletions(-)
diff --git a/python/sedona/spark/geopandas/geoseries.py
b/python/sedona/spark/geopandas/geoseries.py
index 77e6d349c4..b9fb417a8a 100644
--- a/python/sedona/spark/geopandas/geoseries.py
+++ b/python/sedona/spark/geopandas/geoseries.py
@@ -751,9 +751,7 @@ class GeoSeries(GeoFrame, pspd.Series):
if geometry_column is None:
raise ValueError("No geometry column found in GeoSeries")
if self._sindex is None:
- self._sindex = SpatialIndex(
- self._internal.spark_frame, column_name=geometry_column
- )
+ self._sindex = SpatialIndex(self)
return self._sindex
@property
diff --git a/python/sedona/spark/geopandas/sindex.py
b/python/sedona/spark/geopandas/sindex.py
index 1b4829351c..4629289ec9 100644
--- a/python/sedona/spark/geopandas/sindex.py
+++ b/python/sedona/spark/geopandas/sindex.py
@@ -38,12 +38,23 @@ class SpatialIndex:
Parameters
----------
- geometry : np.array of Shapely geometries, PySparkDataFrame column, or
PySparkDataFrame
+ geometry : np.array of Shapely geometries, GeoSeries, or
PySparkDataFrame
index_type : str, default "strtree"
The type of spatial index to use.
column_name : str, optional
The column name to extract geometry from if `geometry` is a
PySparkDataFrame.
+
+ Note: query methods (i.e. query, nearest, intersection) behave differently
depending on how the index is constructed.
+ When constructed from a np.array, the query methods return indices
like original geopandas.
+ When constructed from a GeoSeries or PySparkDataFrame, the query
methods return geometries.
"""
+ from sedona.spark.geopandas import GeoSeries
+
+ if isinstance(geometry, GeoSeries):
+ from sedona.spark.geopandas.geoseries import _get_series_col_name
+
+ column_name = _get_series_col_name(geometry)
+ geometry = geometry._internal.spark_frame
if isinstance(geometry, np.ndarray):
self.geometry = geometry
@@ -65,7 +76,7 @@ class SpatialIndex:
self._build_spark_index(column_name)
else:
raise TypeError(
- "Invalid type for `geometry`. Expected np.array or
PySparkDataFrame."
+ "Invalid type for `geometry`. Expected np.array, GeoSeries, or
PySparkDataFrame."
)
def query(self, geometry: BaseGeometry, predicate: str = None, sort: bool
= False):
@@ -82,12 +93,18 @@ class SpatialIndex:
sort : bool, optional, default False
Whether to sort the results.
+ Note: query() has different behaviors depending on how the index is
constructed.
+ When constructed from a np.array, this method returns indices like
original geopandas.
+ When constructed from a GeoSeries or PySparkDataFrame, this method
returns geometries.
+
Note: Unlike Geopandas, Sedona does not support geometry input of type
np.array or GeoSeries.
+ It is recommended to instead use GeoSeries.intersects directly.
Returns
-------
list
- List of indices of matching geometries.
+ List of geometries if constructed from a GeoSeries or
PySparkDataFrame.
+ List of the corresponding indices if constructed from a np.array.
"""
if not isinstance(geometry, BaseGeometry):
@@ -96,7 +113,7 @@ class SpatialIndex:
)
log_advice(
- "`query` returns local list of indices of matching geometries onto
driver's memory. "
+ "`query` returns a local list onto driver's memory. "
"It should only be used if the resulting collection is expected to
be small."
)
@@ -170,10 +187,15 @@ class SpatialIndex:
Note: Unlike Geopandas, Sedona does not support geometry input of type
np.array or GeoSeries.
+ Note: nearest() has different behaviors depending on how the index is
constructed.
+ When constructed from a np.array, this method returns indices like
original geopandas.
+ When constructed from a GeoSeries or PySparkDataFrame, this method
returns geometries.
+
Returns
-------
list or tuple
- List of indices of nearest geometries, optionally with distances.
+ List of geometries if constructed from a GeoSeries or
PySparkDataFrame.
+ List of the corresponding indices if constructed from a np.array.
"""
if not isinstance(geometry, BaseGeometry):
@@ -194,15 +216,18 @@ class SpatialIndex:
from sedona.spark.core.spatialOperator import KNNQuery
# Execute the KNN query
- results = KNNQuery.SpatialKnnQuery(self._indexed_rdd, geometry, k,
False)
+ geo_data_list = KNNQuery.SpatialKnnQuery(
+ self._indexed_rdd, geometry, k, False
+ )
+
+ # No need to keep the userData field, so convert it directly to a
list of geometries
+ geoms_list = [row.geom for row in geo_data_list]
if return_distance:
# Calculate distances if requested
- distances = [
- geom.distance(geometry) for geom in [row.geom for row in
results]
- ]
- return results, distances
- return results
+ distances = [geom.distance(geometry) for geom in geoms_list]
+ return geoms_list, distances
+ return geoms_list
else:
# For local spatial index based on Shapely STRtree
if k > len(self.geometry):
@@ -220,20 +245,29 @@ class SpatialIndex:
def intersection(self, bounds):
"""
- Find geometries that intersect the given bounding box.
+ Find geometries that intersect the given bounding box. Similar to the
Geopandas version,
+ this is a compatibility wrapper for rtree.index.Index.intersection;
use query instead.
Parameters
----------
bounds : tuple
Bounding box as (min_x, min_y, max_x, max_y).
+ Note: intersection() has different behaviors depending on how the
index is constructed.
+ When constructed from a np.array, this method returns indices like
original geopandas.
+ When constructed from a GeoSeries or PySparkDataFrame, this method
returns geometries.
+
+ Note: Unlike Geopandas, Sedona does not support geometry input of type
np.array or GeoSeries.
+ It is recommended to instead use GeoSeries.intersects directly.
+
Returns
-------
list
- List of indices of matching geometries.
+ List of geometries if constructed from a GeoSeries or
PySparkDataFrame.
+ List of the corresponding indices if constructed from a np.array.
"""
log_advice(
- "`intersection` returns local list of indices of matching
geometries onto driver's memory. "
+ "`intersection` returns local list of matching geometries onto
driver's memory. "
"It should only be used if the resulting collection is expected to
be small."
)
@@ -246,16 +280,7 @@ class SpatialIndex:
bbox = box(*bounds)
if self._is_spark:
- # For Spark-based spatial index
- from sedona.spark.core.spatialOperator import RangeQuery
-
- # Execute the spatial range query with the bounding box
- result_rdd = RangeQuery.SpatialRangeQuery(
- self._indexed_rdd, bbox, True, True
- )
-
- results = result_rdd.collect()
- return results
+ return self.query(bbox, predicate="intersects")
else:
# For local spatial index based on Shapely STRtree
try:
diff --git a/python/tests/geopandas/test_sindex.py
b/python/tests/geopandas/test_sindex.py
index 83c308fdbb..6e68fc40b1 100644
--- a/python/tests/geopandas/test_sindex.py
+++ b/python/tests/geopandas/test_sindex.py
@@ -19,7 +19,7 @@ import pytest
import numpy as np
import shapely
from pyspark.sql.functions import expr
-from shapely.geometry import Point, Polygon, LineString
+from shapely.geometry import Point, Polygon, LineString, box
from tests.test_base import TestBase
from sedona.spark.geopandas import GeoSeries
@@ -63,6 +63,31 @@ class TestSpatialIndex(TestBase):
]
)
+ def test_construct_from_geoseries(self):
+ # Construct from a GeoSeries
+ gs = GeoSeries([Point(x, x) for x in range(5)])
+ sindex = SpatialIndex(gs)
+ result = sindex.query(Point(2, 2))
+ # A SpatialIndex constructed from a GeoSeries returns geometries
+ assert result == [Point(2, 2)]
+
+ def test_construct_from_pyspark_dataframe(self):
+ # Construct from PySparkDataFrame
+ df = self.spark.createDataFrame(
+ [(Point(x, x),) for x in range(5)], ["geometry"]
+ )
+ sindex = SpatialIndex(df, column_name="geometry")
+ result = sindex.query(Point(2, 2))
+ assert result == [Point(2, 2)]
+
+ def test_construct_from_nparray(self):
+ # Construct from np.array
+ array = np.array([Point(x, x) for x in range(5)])
+ sindex = SpatialIndex(array)
+ result = sindex.query(Point(2, 2))
+ # Returns indices like original geopandas
+ assert result == np.array([2])
+
def test_geoseries_sindex_property_exists(self):
"""Test that the sindex property exists on GeoSeries."""
assert hasattr(self.points, "sindex")
@@ -182,7 +207,7 @@ class TestSpatialIndex(TestBase):
assert len(nearest_result) == 1
# The nearest point should have id=2 (POINT(1 1))
- assert nearest_result[0].geom.wkt == "POINT (1 1)"
+ assert nearest_result[0].wkt == "POINT (1 1)"
# Test finding k=2 nearest neighbors
nearest_2_results = spark_sindex.nearest(query_point, k=2)
@@ -219,7 +244,7 @@ class TestSpatialIndex(TestBase):
# Should find polygon containing the point
assert len(nearest_geom) == 1
- assert "POLYGON" in nearest_geom[0].geom.wkt
+ assert "POLYGON" in nearest_geom[0].wkt
# Test with linestring query
query_line = LineString([(1.5, 1.5), (2.5, 2.5)])
@@ -343,7 +368,12 @@ class TestSpatialIndex(TestBase):
result_rows = spark_sindex.intersection(bounds)
# Verify correct results are returned
- assert len(result_rows) >= 2
+ expected = [
+ Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+ Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+ Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+ ]
+ assert result_rows == expected
# Test with bounds that don't intersect any geometry
empty_bounds = (10, 10, 11, 11)
@@ -353,7 +383,14 @@ class TestSpatialIndex(TestBase):
# Test with bounds that cover all geometries
full_bounds = (-1, -1, 6, 6)
full_results = spark_sindex.intersection(full_bounds)
- assert len(full_results) == 5 # Should match all 5 polygons
+ expected = [
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]),
+ Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+ Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+ Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+ Polygon([(4, 4), (5, 4), (5, 5), (4, 5), (4, 4)]),
+ ]
+ assert full_results == expected
def test_intersection_with_points(self):
"""Test the intersection method with point geometries."""
@@ -426,3 +463,11 @@ class TestSpatialIndex(TestBase):
# Verify results
assert len(results) == 2
+
+ # test from the geopandas docstring
+ def test_geoseries_sindex_intersection(self):
+ gs = GeoSeries([Point(x, x) for x in range(10)])
+ result = gs.sindex.intersection(box(1, 1, 3, 3).bounds)
+ # Unlike original geopandas, this returns geometries instead of indices
+ expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+ assert result == expected