This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 5cf9e607c2 [GH-2070] Geopandas.GeoSeries: Implement `get_geometry` (#2071)
5cf9e607c2 is described below
commit 5cf9e607c2819152863195f8bce383cf547bb5b9
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Jul 11 23:22:12 2025 -0700
[GH-2070] Geopandas.GeoSeries: Implement `get_geometry` (#2071)
* Implement get_geometry
* Skip test for old versions
* Support extra-negative indices
* Skip test for shapely < 2.0.0
* Update docs to mention the shapely < 2.0.0 behavior
---
python/sedona/geopandas/geoseries.py | 122 ++++++++++++++++++++-
python/tests/geopandas/test_geoseries.py | 45 +++++++-
.../tests/geopandas/test_match_geopandas_series.py | 18 ++-
3 files changed, 177 insertions(+), 8 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 06602b27ce..97e35038b3 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -935,9 +935,98 @@ class GeoSeries(GeoFrame, pspd.Series):
# Implementation of the abstract method
raise NotImplementedError("This method is not implemented yet.")
- def get_geometry(self, index):
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
+    def get_geometry(self, index) -> "GeoSeries":
+        """Return the n-th (0-indexed) geometry of each collection.
+
+        A non-negative ``index`` selects the geometry at that position. A
+        negative ``index`` counts backward from the end of the collection
+        (e.g., -1 selects the last geometry). An index that falls outside the
+        collection yields None.
+
+        Parameters
+        ----------
+        index : int or array_like
+            Position of a geometry to be retrieved within its collection
+
+        Returns
+        -------
+        GeoSeries
+
+        Notes
+        -----
+        Simple geometries act as collections of length 1. Any out-of-range
+        index value returns None.
+
+        Using Shapely < 2.0 may lead to different results for empty simple
+        geometries due to how shapely interprets them.
+
+        Examples
+        --------
+        >>> from shapely.geometry import (
+        ...     GeometryCollection,
+        ...     MultiPoint,
+        ...     Point,
+        ...     Polygon,
+        ... )
+        >>> s = geopandas.GeoSeries(
+        ...     [
+        ...         Point(0, 0),
+        ...         MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0)]),
+        ...         GeometryCollection(
+        ...             [MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0)]), Point(0, 1)]
+        ...         ),
+        ...         Polygon(),
+        ...         GeometryCollection(),
+        ...     ]
+        ... )
+        >>> s
+        0    POINT (0 0)
+        1    MULTIPOINT ((0 0), (1 1), (0 1), (1 0))
+        2    GEOMETRYCOLLECTION (MULTIPOINT ((0 0), (1 1), ...
+        3    POLYGON EMPTY
+        4    GEOMETRYCOLLECTION EMPTY
+        dtype: geometry
+
+        >>> s.get_geometry(0)
+        0    POINT (0 0)
+        1    POINT (0 0)
+        2    MULTIPOINT ((0 0), (1 1), (0 1), (1 0))
+        3    POLYGON EMPTY
+        4    None
+        dtype: geometry
+
+        >>> s.get_geometry(1)
+        0    None
+        1    POINT (1 1)
+        2    POINT (0 1)
+        3    None
+        4    None
+        dtype: geometry
+
+        >>> s.get_geometry(-1)
+        0    POINT (0 0)
+        1    POINT (1 0)
+        2    POINT (0 1)
+        3    POLYGON EMPTY
+        4    None
+        dtype: geometry
+
+        """
+
+        # Sedona errors on negative indexes, so the CASE expression resolves
+        # them itself: a negative index that still points before the first
+        # element becomes NULL, any other negative index is shifted by the
+        # collection size, and a non-negative index is passed through.
+        # The statement text is kept exactly as it appears in the original.
+        geometry_n_sql = """
+ ST_GeometryN(
+ `L`,
+ CASE
+ WHEN ST_NumGeometries(`L`) + `R` < 0 THEN NULL
+ WHEN `R` < 0 THEN ST_NumGeometries(`L`) + `R`
+ ELSE `R`
+ END
+ )
+ """
+
+        # NOTE(review): `_row_wise_operation` repeats any value that is not a
+        # pandas-on-Spark Series once per row, so a plain list passed as
+        # `index` looks like it would be broadcast whole rather than matched
+        # element-wise -- confirm before relying on array_like input.
+        return self._row_wise_operation(
+            select=geometry_n_sql,
+            other=index,
+            align=False,
+            rename="get_geometry",
+        )
@property
def boundary(self):
@@ -1353,7 +1442,7 @@ class GeoSeries(GeoFrame, pspd.Series):
def _row_wise_operation(
self,
select: str,
- other: Union["GeoSeries", BaseGeometry],
+ other: Any,
align: Union[bool, None],
rename: str,
returns_geom: bool = True,
@@ -1372,7 +1461,11 @@ class GeoSeries(GeoFrame, pspd.Series):
if isinstance(other, BaseGeometry):
other = GeoSeries([other] * len(self))
- assert isinstance(other, GeoSeries), f"Invalid type for other:
{type(other)}"
+ # e.g int input
+ if not isinstance(other, pspd.Series):
+ other = pspd.Series([other] * len(self))
+
+ assert isinstance(other, pspd.Series), f"Invalid type for other:
{type(other)}"
# This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
# and would need to be updated if Sedona later supports multi-index
@@ -1385,7 +1478,7 @@ class GeoSeries(GeoFrame, pspd.Series):
col(SPARK_DEFAULT_INDEX_NAME),
)
other_df = other._internal.spark_frame.select(
- col(other.get_first_geometry_column()).alias("R"),
+ col(_get_first_column_name(other)).alias("R"),
# for the right side, we only need the column that we are joining
on
col(index_col),
)
@@ -2368,6 +2461,23 @@ class GeoSeries(GeoFrame, pspd.Series):
# -----------------------------------------------------------------------------
+def _get_first_column_name(series: pspd.Series) -> str:
+    """
+    Get the first column name of a Series.
+
+    The fields of the Series' underlying Spark frame are scanned in schema
+    order, skipping the columns named by ``SPARK_DEFAULT_INDEX_NAME`` and
+    ``NATURAL_ORDER_COLUMN_NAME``.
+
+    Parameters
+    ----------
+    series : pspd.Series
+        The input Series.
+
+    Returns
+    -------
+    str
+        The first column name of the Series.
+
+    Raises
+    ------
+    ValueError
+        If the Spark frame has no column other than the two skipped ones.
+    """
+    skipped_columns = (SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME)
+    for field in series._internal.spark_frame.schema.fields:
+        if field.name not in skipped_columns:
+            return field.name
+    # A bare next() over the fields would surface as StopIteration here, which
+    # carries no context and is easily misread by callers that iterate.
+    raise ValueError("Series has no data column besides the index columns")
+
+
def _to_spark_pandas_df(ps_series: pspd.Series) -> pspd.DataFrame:
    """Build a pandas-on-Spark DataFrame from the internal frame behind *ps_series*.

    The internal frame is read from ``ps_series._psdf._internal`` and handed
    to the ``pspd.DataFrame`` constructor unchanged.
    """
    backing_internal = ps_series._psdf._internal
    return pspd.DataFrame(backing_internal)
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 6aabf33f25..298b7b13ac 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import shapely
import numpy as np
import pytest
import pandas as pd
@@ -446,7 +447,49 @@ class TestGeoSeries(TestBase):
pass
def test_get_geometry(self):
- pass
+        # Shapely 1 seems to have a bug where Polygon() is incorrectly
+        # interpreted as a GeometryCollection, so the expectations below are
+        # only checked on Shapely >= 2.
+        # The major version is compared numerically: comparing version strings
+        # is lexicographic and would order e.g. "10.0.0" before "2.0.0".
+        # pytest.skip reports the test as skipped instead of silently passing.
+        if int(shapely.__version__.split(".")[0]) < 2:
+            pytest.skip("get_geometry expectations require shapely >= 2.0.0")
+
+        from shapely.geometry import MultiPoint
+
+        s = GeoSeries(
+            [
+                Point(0, 0),
+                MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0)]),
+                GeometryCollection(
+                    [MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0)]), Point(0, 1)]
+                ),
+                Polygon(),
+                GeometryCollection(),
+            ]
+        )
+
+        # First element: simple geometries are returned as-is, the empty
+        # collection has nothing to return.
+        result = s.get_geometry(0)
+        expected = gpd.GeoSeries(
+            [
+                Point(0, 0),
+                Point(0, 0),
+                MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0)]),
+                Polygon(),
+                None,
+            ]
+        )
+        self.check_sgpd_equals_gpd(result, expected)
+
+        # Second element: out of range for every length-1 or empty input.
+        result = s.get_geometry(1)
+        expected = gpd.GeoSeries([None, Point(1, 1), Point(0, 1), None, None])
+        self.check_sgpd_equals_gpd(result, expected)
+
+        # Negative index counts from the end of each collection.
+        result = s.get_geometry(-1)
+        expected = gpd.GeoSeries(
+            [Point(0, 0), Point(1, 0), Point(0, 1), Polygon(), None]
+        )
+        self.check_sgpd_equals_gpd(result, expected)
+
+        # Third element: only the four-point MultiPoint is long enough.
+        result = s.get_geometry(2)
+        expected = gpd.GeoSeries([None, Point(0, 1), None, None, None])
+        self.check_sgpd_equals_gpd(result, expected)
def test_boundary(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index cbb61527c5..389597c651 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -38,6 +38,7 @@ from shapely.geometry import (
from sedona.geopandas import GeoSeries
from tests.test_base import TestBase
import pyspark.pandas as ps
+from packaging.version import parse as parse_version
class TestMatchGeopandasSeries(TestBase):
@@ -460,7 +461,22 @@ class TestMatchGeopandasSeries(TestBase):
pass
def test_get_geometry(self):
- pass
+        # geopandas releases older than 1.0.0 are not compared; the test
+        # returns early for them.
+        installed_gpd = parse_version(gpd.__version__)
+        if installed_gpd < parse_version("1.0.0"):
+            return
+
+        def assert_matches_geopandas(data, position):
+            # Run get_geometry through both implementations and compare.
+            actual = GeoSeries(data).get_geometry(position)
+            reference = gpd.GeoSeries(data).get_geometry(position)
+            self.check_sgpd_equals_gpd(actual, reference)
+
+        for _, geom in self.geoms:
+            # test negative index, in-bounds index, and out of bounds index
+            for position in (-1, 0, len(geom) + 1):
+                assert_matches_geopandas(geom, position)
+
+        # Geometries constructed without coordinates, probed on both sides
+        # of zero.
+        empty_geometries = [GeometryCollection(), Polygon(), MultiPolygon()]
+        for position in (-2, -1, 0, 1):
+            assert_matches_geopandas(empty_geometries, position)
def test_boundary(self):
pass