This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 30cc3b14ea [GH-2020] Geopandas.GeoSeries: Implement .length and refactor process_geometry_column() (#2021)
30cc3b14ea is described below
commit 30cc3b14ea494c2717d502e83ea26841da73aa1c
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 1 14:52:41 2025 -0700
[GH-2020] Geopandas.GeoSeries: Implement .length and refactor process_geometry_column() (#2021)
* Refactor process_geometry_column to create a more flexible query_geometry_column()
* Implement length()
---
python/sedona/geopandas/geoseries.py | 129 +++++++++++++++------
python/tests/geopandas/test_geoseries.py | 19 ++-
.../tests/geopandas/test_match_geopandas_series.py | 6 +-
3 files changed, 116 insertions(+), 38 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 066cacb75a..4dc612d171 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -340,6 +340,7 @@ class GeoSeries(GeoFrame, pspd.Series):
) -> "GeoSeries":
"""
Helper method to process a single geometry column with a specified operation.
+ This method wraps the _query_geometry_column method for simpler, more convenient use.
Parameters
----------
@@ -360,45 +361,66 @@ class GeoSeries(GeoFrame, pspd.Series):
# Find the first column with BinaryType or GeometryType
first_col = self.get_first_geometry_column() # TODO: fixme
- if first_col:
- data_type = self._internal.spark_frame.schema[first_col].dataType
+ # Handle both positional and keyword arguments
+ all_args = list(args)
+ for k, v in kwargs.items():
+ all_args.append(v)
- # Handle both positional and keyword arguments
- all_args = list(args)
- for k, v in kwargs.items():
- all_args.append(v)
+ # Join all arguments as comma-separated values
+ params = ""
+ if all_args:
+ params_list = [
+ str(arg) if isinstance(arg, (int, float)) else repr(arg)
+ for arg in all_args
+ ]
+ params = f", {', '.join(params_list)}"
- # Join all arguments as comma-separated values
- params = ""
- if all_args:
- params_list = [
- str(arg) if isinstance(arg, (int, float)) else repr(arg)
- for arg in all_args
- ]
- params = f", {', '.join(params_list)}"
+ sql_expr = f"{operation}(`{first_col}`{params})"
- rename = first_col if not rename else rename
+ return self._query_geometry_column(sql_expr, first_col, rename)
- if isinstance(data_type, BinaryType):
- sql_expr = (
- f"{operation}(ST_GeomFromWKB(`{first_col}`){params}) as
`{rename}`"
- )
- else:
- sql_expr = f"{operation}(`{first_col}`{params}) as `{rename}`"
-
- sdf = self._internal.spark_frame.selectExpr(sql_expr)
- internal = InternalFrame(
- spark_frame=sdf,
- index_spark_columns=None,
- column_labels=[self._column_label],
- data_spark_columns=[scol_for(sdf, rename)],
- data_fields=[self._internal.data_fields[0]],
- column_label_names=self._internal.column_label_names,
- )
- return _to_geo_series(first_series(PandasOnSparkDataFrame(internal)))
- else:
+ def _query_geometry_column(
+ self, query: str, col: Union[str, None], rename: str
+ ) -> "GeoSeries":
+ """
+ Helper method to query a single geometry column with a specified operation.
+
+ Parameters
+ ----------
+ query : str
+ The query to apply to the geometry column.
+ col : str
+ The name of the column to query.
+ rename : str
+ The name of the resulting column.
+
+ Returns
+ -------
+ GeoSeries
+ A GeoSeries with the operation applied to the geometry column.
+ """
+ if not col:
raise ValueError("No valid geometry column found.")
+ data_type = self._internal.spark_frame.schema[col].dataType
+
+ if isinstance(data_type, BinaryType):
+ # the backticks here are important so we don't match strings that happen to be the same as the column name
+ query = query.replace(f"`{col}`", f"ST_GeomFromWKB(`{col}`)")
+
+ sql_expr = f"{query} as `{rename}`"
+
+ sdf = self._internal.spark_frame.selectExpr(sql_expr)
+ internal = InternalFrame(
+ spark_frame=sdf,
+ index_spark_columns=None,
+ column_labels=[self._column_label],
+ data_spark_columns=[scol_for(sdf, rename)],
+ data_fields=[self._internal.data_fields[0]],
+ column_label_names=self._internal.column_label_names,
+ )
+ return _to_geo_series(first_series(PandasOnSparkDataFrame(internal)))
+
@property
def dtypes(self) -> Union[gpd.GeoSeries, pd.Series, Dtype]:
# Implementation of the abstract method
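
[Editor's sketch, not part of the commit: a minimal standalone illustration of the string handling the refactor above introduces. _process_geometry_column assembles "{operation}(`col`{params})" and hands it to _query_geometry_column, which wraps BinaryType (WKB) columns in ST_GeomFromWKB and aliases the result. The function and column names below are illustrative.]

    # Illustrative sketch only; mirrors the string handling shown in the diff above.
    def build_query(operation, col, *args):
        # Positional args become comma-separated SQL literals, as in _process_geometry_column.
        params_list = [str(a) if isinstance(a, (int, float)) else repr(a) for a in args]
        params = f", {', '.join(params_list)}" if params_list else ""
        return f"{operation}(`{col}`{params})"

    def finalize_query(query, col, rename, is_binary):
        # As in _query_geometry_column: decode WKB columns first, then alias the result.
        if is_binary:
            query = query.replace(f"`{col}`", f"ST_GeomFromWKB(`{col}`)")
        return f"{query} as `{rename}`"

    print(finalize_query(build_query("ST_Length", "geometry"), "geometry", "length", is_binary=True))
    # ST_Length(ST_GeomFromWKB(`geometry`)) as `length`
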
@@ -511,9 +533,44 @@ class GeoSeries(GeoFrame, pspd.Series):
raise NotImplementedError("This method is not implemented yet.")
@property
- def length(self):
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
+ def length(self) -> pspd.Series:
+ """
+ Returns a Series containing the length of each geometry in the GeoSeries.
+
+ In the case of a (Multi)Polygon it measures the length of its exterior (i.e. perimeter).
+
+ For a GeometryCollection it sums the values for each of the individual geometries.
+
+ Returns
+ -------
+ Series
+ A Series containing the length of each geometry.
+
+ Examples
+ --------
+ >>> from shapely.geometry import Point, LineString, Polygon, GeometryCollection
+ >>> import geopandas as gpd
+ >>> from sedona.geopandas import GeoSeries
+
+ >>> gs = GeoSeries([Point(0, 0), LineString([(0, 0), (1, 1)]), Polygon([(0, 0), (1, 0), (1, 1)]), GeometryCollection([Point(0, 0), LineString([(0, 0), (1, 1)]), Polygon([(0, 0), (1, 0), (1, 1)])])])
+ >>> gs.length
+ 0 0.000000
+ 1 1.414214
+ 2 3.414214
+ 3 4.828427
+ dtype: float64
+ """
+ col = self.get_first_geometry_column()
+ select = f"""
+ CASE
+ WHEN GeometryType(`{col}`) IN ('LINESTRING', 'MULTILINESTRING') THEN ST_Length(`{col}`)
+ WHEN GeometryType(`{col}`) IN ('POLYGON', 'MULTIPOLYGON') THEN ST_Perimeter(`{col}`)
+ WHEN GeometryType(`{col}`) IN ('POINT', 'MULTIPOINT') THEN 0.0
+ WHEN GeometryType(`{col}`) IN ('GEOMETRYCOLLECTION') THEN ST_Length(`{col}`) + ST_Perimeter(`{col}`)
+ END"""
+ return self._query_geometry_column(
+ select, col, rename="length"
+ ).to_spark_pandas()
@property
def is_valid(self):
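
[Editor's sketch, not part of the commit: a rough usage example of the new length property, mirroring the docstring example above and the test below, assuming a Spark session with Sedona registered.]

    # Usage sketch based on the docstring and tests in this commit.
    from shapely.geometry import Point, LineString, Polygon
    from sedona.geopandas import GeoSeries

    gs = GeoSeries([
        Point(0, 0),                        # length 0.0
        LineString([(0, 0), (1, 1)]),       # length sqrt(2) ~= 1.414214
        Polygon([(0, 0), (1, 0), (1, 1)]),  # perimeter 1 + 1 + sqrt(2) ~= 3.414214
    ])

    lengths = gs.length          # a pyspark.pandas Series, per the new property
    print(lengths.to_pandas())   # plain pandas Series of the values above
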
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 3b8020c513..0c13e92673 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -19,6 +19,7 @@ import pytest
import pandas as pd
import geopandas as gpd
import sedona.geopandas as sgpd
+from sedona.geopandas import GeoSeries
from tests.test_base import TestBase
from shapely import wkt
from shapely.geometry import Point, LineString, Polygon, GeometryCollection
@@ -154,7 +155,23 @@ class TestGeoSeries(TestBase):
pass
def test_length(self):
- pass
+ geoseries = GeoSeries(
+ [
+ Point(0, 0),
+ LineString([(0, 0), (1, 1)]),
+ Polygon([(0, 0), (1, 0), (1, 1)]),
+ GeometryCollection(
+ [
+ Point(0, 0),
+ LineString([(0, 0), (1, 1)]),
+ Polygon([(0, 0), (1, 0), (1, 1)]),
+ ]
+ ),
+ ]
+ )
+ result = geoseries.length.to_pandas()
+ expected = pd.Series([0.000000, 1.414214, 3.414214, 4.828427])
+ assert_series_equal(result, expected)
def test_is_valid(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index 1f6943624a..fee2fa565d 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -291,7 +291,11 @@ class TestMatchGeopandasSeries(TestBase):
pass
def test_length(self):
- pass
+ for _, geom in self.geoms:
+ sgpd_result = GeoSeries(geom).length
+ assert isinstance(sgpd_result, ps.Series)
+ gpd_result = gpd.GeoSeries(geom).length
+ self.check_pd_series_equal(sgpd_result, gpd_result)
def test_is_valid(self):
pass