This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 406e35212e [GH-2050] Geopandas.GeoSeries: Implement align=False for
_row_wise_operation + support indexes (#2057)
406e35212e is described below
commit 406e35212ea903f59aab2271b49e2808bc966898
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jul 3 21:48:44 2025 -0700
[GH-2050] Geopandas.GeoSeries: Implement align=False for
_row_wise_operation + support indexes (#2057)
---
python/sedona/geopandas/geoseries.py | 138 +++++++++++++--------
python/tests/geopandas/test_geoseries.py | 68 +++++++++-
.../tests/geopandas/test_match_geopandas_series.py | 30 ++++-
3 files changed, 183 insertions(+), 53 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index b3bbdb418a..805b827fb8 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -39,7 +39,10 @@ from sedona.geopandas.base import GeoFrame
from sedona.geopandas.geodataframe import GeoDataFrame
from sedona.geopandas.geoindex import GeoIndex
-from pyspark.pandas.internal import SPARK_DEFAULT_INDEX_NAME #
__index_level_0__
+from pyspark.pandas.internal import (
+ SPARK_DEFAULT_INDEX_NAME, # __index_level_0__
+ NATURAL_ORDER_COLUMN_NAME,
+)
class GeoSeries(GeoFrame, pspd.Series):
@@ -169,14 +172,7 @@ class GeoSeries(GeoFrame, pspd.Series):
gs.apply(lambda geom: geom.wkb if geom is not None else None)
)
# initialize the parent class pyspark Series with the pandas Series
- super().__init__(
- data=pdf,
- index=index,
- dtype=dtype,
- name=name,
- copy=copy,
- fastpath=fastpath,
- )
+ super().__init__(data=pdf)
if crs:
self.set_crs(crs, inplace=True)
@@ -950,7 +946,7 @@ class GeoSeries(GeoFrame, pspd.Series):
An object is said to intersect `other` if its `boundary` and `interior`
intersects in any way with those of the other.
- The operation works on a 1-to-1 row-wise manner:
+ The operation works on a 1-to-1 row-wise manner.
Parameters
----------
@@ -959,7 +955,7 @@ class GeoSeries(GeoFrame, pspd.Series):
intersected.
align : bool | None (default None)
If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
- If False, the order of elements is preserved. (not supported in
Sedona Geopandas)
+ If False, the order of elements is preserved.
Returns
-------
@@ -982,23 +978,26 @@ class GeoSeries(GeoFrame, pspd.Series):
... LineString([(1, 0), (1, 3)]),
... LineString([(2, 0), (0, 2)]),
... Point(1, 1),
- ... Point(-100, -100),
+ ... Point(0, 1),
... ],
... index=range(1, 5),
... )
- We can check two GeoSeries against each other, row by row.
- The GeoSeries above have different indices. We align both GeoSeries
- based on index values and compare elements with the same index:
+ >>> s
+ 0 POLYGON ((0 0, 2 2, 0 2, 0 0))
+ 1 LINESTRING (0 0, 2 2)
+ 2 LINESTRING (2 0, 0 2)
+ 3 POINT (0 1)
+ dtype: geometry
- >>> s.intersects(s2)
- 0 True
- 1 True
- 2 True
- 3 False
- dtype: bool
+ >>> s2
+ 1 LINESTRING (1 0, 1 3)
+ 2 LINESTRING (2 0, 0 2)
+ 3 POINT (1 1)
+ 4 POINT (0 1)
+ dtype: geometry
- We can also check if each geometry of GeoSeries intersects a single
+ We can check if each geometry of GeoSeries intersects a single
geometry:
>>> line = LineString([(-1, 1), (3, 1)])
@@ -1009,6 +1008,27 @@ class GeoSeries(GeoFrame, pspd.Series):
3 True
dtype: bool
+ We can also check two GeoSeries against each other, row by row.
+ The GeoSeries above have different indices. We can either align both
GeoSeries
+ based on index values and compare elements with the same index using
+ ``align=True`` or ignore index and compare elements based on their
matching
+ order using ``align=False``:
+
+ >>> s.intersects(s2, align=True)
+ 0 False
+ 1 True
+ 2 True
+ 3 False
+ 4 False
+ dtype: bool
+
+ >>> s.intersects(s2, align=False)
+ 0 True
+ 1 True
+ 2 True
+ 3 True
+ dtype: bool
+
Notes
-----
This method works in a row-wise manner. It does not check if an element
@@ -1035,7 +1055,7 @@ class GeoSeries(GeoFrame, pspd.Series):
"""Returns a ``GeoSeries`` of the intersection of points in each
aligned geometry with `other`.
- The operation works on a 1-to-1 row-wise manner:
+ The operation works on a 1-to-1 row-wise manner.
Parameters
----------
@@ -1044,7 +1064,7 @@ class GeoSeries(GeoFrame, pspd.Series):
intersection with.
align : bool | None (default None)
If True, automatically aligns GeoSeries based on their indices.
None defaults to True.
- If False, the order of elements is preserved. (not supported in
Sedona Geopandas)
+ If False, the order of elements is preserved.
Returns
-------
@@ -1069,43 +1089,62 @@ class GeoSeries(GeoFrame, pspd.Series):
... LineString([(1, 0), (1, 3)]),
... LineString([(2, 0), (0, 2)]),
... Point(1, 1),
- ... Point(-100, -100),
+ ... Point(0, 1),
... ],
+ ... index=range(1, 6),
... )
- We can do an intersection of each geometry and a single
- shapely geometry:
+ >>> s
+ 0 POLYGON ((0 0, 2 2, 0 2, 0 0))
+ 1 POLYGON ((0 0, 2 2, 0 2, 0 0))
+ 2 LINESTRING (0 0, 2 2)
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT (0 1)
+ dtype: geometry
- >>> geom = Polygon([(-0.5, -0.5), (-0.5, 2.5), (2.5, 2.5), (2.5,
-0.5), (-0.5, -0.5)])
- >>> s.intersection(geom)
- Polygon([(0, 0), (2, 2), (0, 2)]),
- Polygon([(0, 0), (2, 2), (0, 2)]),
- LineString([(0, 0), (2, 2)]),
- LineString([(2, 0), (0, 2)]),
- Point(0, 1),
+ >>> s2
+ 1 POLYGON ((0 0, 1 1, 0 1, 0 0))
+ 2 LINESTRING (1 0, 1 3)
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT (1 1)
+ 5 POINT (0 1)
dtype: geometry
- >>> geom = Polygon([(-0.5, -0.5), (-0.5, 2.5), (2.5, 2.5), (2.5,
-0.5), (-0.5, -0.5)])
+ We can also do intersection of each geometry and a single
+ shapely geometry:
+
>>> s.intersection(Polygon([(0, 0), (1, 1), (0, 1)]))
- 0 POLYGON ((0 0, 2 2, 0 2))
- 1 POLYGON ((0 0, 2 2, 0 2))
- 2 LINESTRING (0 0, 2 2)
- 3 LINESTRING (2 0, 0 2)
+ 0 POLYGON ((0 0, 0 1, 1 1, 0 0))
+ 1 POLYGON ((0 0, 0 1, 1 1, 0 0))
+ 2 LINESTRING (0 0, 1 1)
+ 3 POINT (1 1)
4 POINT (0 1)
dtype: geometry
We can also check two GeoSeries against each other, row by row.
- The GeoSeries above have different indices. We align both GeoSeries
- based on index values and compare elements with the same index.
+ The GeoSeries above have different indices. We can either align both
GeoSeries
+ based on index values and compare elements with the same index using
+ ``align=True`` or ignore index and compare elements based on their
matching
+ order using ``align=False``:
+
+ >>> s.intersection(s2, align=True)
+ 0 None
+ 1 POLYGON ((0 0, 0 1, 1 1, 0 0))
+ 2 POINT (1 1)
+ 3 LINESTRING (2 0, 0 2)
+ 4 POINT EMPTY
+ 5 None
+ dtype: geometry
- >>> s.intersection(s2)
- 0 POLYGON ((0 0, 1 1, 0 1, 0 0))
+ >>> s.intersection(s2, align=False)
+ 0 POLYGON ((0 0, 0 1, 1 1, 0 0))
1 LINESTRING (1 1, 1 2)
2 POINT (1 1)
3 POINT (1 1)
- 4 POLYGON EMPTY
+ 4 POINT (0 1)
dtype: geometry
+
See Also
--------
GeoSeries.difference
@@ -1130,8 +1169,9 @@ class GeoSeries(GeoFrame, pspd.Series):
from pyspark.sql.functions import col
# Note: this is specifically False. None is valid since it defaults to
True similar to geopandas
- if align is False:
- raise NotImplementedError("Sedona Geopandas does not support
align=False")
+ index_col = (
+ NATURAL_ORDER_COLUMN_NAME if align is False else
SPARK_DEFAULT_INDEX_NAME
+ )
if isinstance(other, BaseGeometry):
other = GeoSeries([other] * len(self))
@@ -1141,13 +1181,13 @@ class GeoSeries(GeoFrame, pspd.Series):
# TODO: this does not yet support multi-index
df = self._internal.spark_frame.select(
col(self.get_first_geometry_column()).alias("L"),
- col(SPARK_DEFAULT_INDEX_NAME),
+ col(index_col),
)
other_df = other._internal.spark_frame.select(
col(other.get_first_geometry_column()).alias("R"),
- col(SPARK_DEFAULT_INDEX_NAME),
+ col(index_col),
)
- joined_df = df.join(other_df, on=SPARK_DEFAULT_INDEX_NAME, how="outer")
+ joined_df = df.join(other_df, on=index_col, how="outer")
return self._query_geometry_column(
select,
cols=["L", "R"],
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 21b84ca7e5..6aabd495dc 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -59,7 +59,10 @@ class TestGeoSeries(TestBase):
assert len(actual) == len(expected)
sgpd_result = actual.to_geopandas()
for a, e in zip(sgpd_result, expected):
- if a.is_empty and e.is_empty:
+ if a is None or e is None:
+ assert a is None and e is None
+ continue
+ elif a.is_empty and e.is_empty:
continue
self.assert_geometry_almost_equal(a, e)
@@ -466,6 +469,23 @@ class TestGeoSeries(TestBase):
expected = pd.Series([True, True, True, True])
assert_series_equal(result.to_pandas(), expected)
+ # from the original doc string
+ s2 = sgpd.GeoSeries(
+ [
+ LineString([(1, 0), (1, 3)]),
+ LineString([(2, 0), (0, 2)]),
+ Point(1, 1),
+ Point(0, 1),
+ ],
+ index=range(1, 5),
+ )
+
+ result = s.intersects(s2, align=True)
+ expected = pd.Series([False, True, True, False, False])
+
+ result = s.intersects(s2, align=False)
+ expected = pd.Series([True, True, True, True])
+
def test_intersection(self):
s = sgpd.GeoSeries(
[
@@ -513,8 +533,50 @@ class TestGeoSeries(TestBase):
)
self.check_sgpd_equals_gpd(result, expected)
- with pytest.raises(NotImplementedError):
- s.intersection(s2, align=False)
+ # from the original doc string
+ s = sgpd.GeoSeries(
+ [
+ Polygon([(0, 0), (2, 2), (0, 2)]),
+ Polygon([(0, 0), (2, 2), (0, 2)]),
+ LineString([(0, 0), (2, 2)]),
+ LineString([(2, 0), (0, 2)]),
+ Point(0, 1),
+ ],
+ )
+ s2 = sgpd.GeoSeries(
+ [
+ Polygon([(0, 0), (1, 1), (0, 1)]),
+ LineString([(1, 0), (1, 3)]),
+ LineString([(2, 0), (0, 2)]),
+ Point(1, 1),
+ Point(0, 1),
+ ],
+ index=range(1, 6),
+ )
+ result = s.intersection(s2, align=True)
+ expected = gpd.GeoSeries(
+ [
+ None,
+ Polygon([(0, 0), (0, 1), (1, 1), (0, 0)]),
+ Point(1, 1),
+ LineString([(2, 0), (0, 2)]),
+ Point(),
+ None,
+ ]
+ )
+ self.check_sgpd_equals_gpd(result, expected)
+
+ result = s.intersection(s2, align=False)
+ expected = gpd.GeoSeries(
+ [
+ Polygon([(0, 0), (0, 1), (1, 1), (0, 0)]),
+ LineString([(1, 1), (1, 2)]),
+ Point(1, 1),
+ Point(1, 1),
+ Point(0, 1),
+ ]
+ )
+ self.check_sgpd_equals_gpd(result, expected)
def test_intersection_all(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py
b/python/tests/geopandas/test_match_geopandas_series.py
index 1990b87074..289da18401 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -496,6 +496,15 @@ class TestMatchGeopandasSeries(TestBase):
gpd_result =
gpd.GeoSeries(geom).intersects(gpd.GeoSeries(geom2))
self.check_pd_series_equal(sgpd_result, gpd_result)
+ if len(geom) == len(geom2):
+ sgpd_result = GeoSeries(geom).intersects(
+ GeoSeries(geom2), align=False
+ )
+ gpd_result = gpd.GeoSeries(geom).intersects(
+ gpd.GeoSeries(geom2), align=False
+ )
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
def test_intersection(self):
geometries = [
Polygon([(0, 0), (1, 0), (1, 1)]),
@@ -511,6 +520,22 @@ class TestMatchGeopandasSeries(TestBase):
gpd_result = gpd.GeoSeries(g1).intersection(gpd.GeoSeries(g2))
self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+ # Ensure both align True and False work correctly
+ for _, g1 in self.geoms:
+ for _, g2 in self.geoms:
+ gpd_series1, gpd_series2 = gpd.GeoSeries(g1), gpd.GeoSeries(g2)
+ # The original geopandas intersection method fails on invalid
geometries
+ if not gpd_series1.is_valid.all() or not
gpd_series2.is_valid.all():
+ continue
+ sgpd_result = GeoSeries(g1).intersection(GeoSeries(g2))
+ gpd_result = gpd_series1.intersection(gpd_series2)
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+ if len(g1) == len(g2):
+ sgpd_result = GeoSeries(g1).intersects(GeoSeries(g2),
align=False)
+ gpd_result = gpd_series1.intersects(gpd_series2,
align=False)
+ self.check_pd_series_equal(sgpd_result, gpd_result)
+
def test_intersection_all(self):
pass
@@ -554,8 +579,11 @@ class TestMatchGeopandasSeries(TestBase):
assert isinstance(expected, gpd.GeoSeries)
sgpd_result = actual.to_geopandas()
for a, e in zip(sgpd_result, expected):
+ if a is None or e is None:
+ assert a is None and e is None
+ continue
# Sometimes sedona and geopandas both return empty geometries but
of different types (e.g Point and Polygon)
- if a.is_empty and e.is_empty:
+ elif a.is_empty and e.is_empty:
continue
self.assert_geometry_almost_equal(
a, e, tolerance=1e-2