(sedona) branch master updated: [GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs, bounds, total_bounds (#2056)

jiayu Tue, 08 Jul 2025 18:32:28 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new 3f407e96f4 [GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs, 
bounds, total_bounds (#2056)
3f407e96f4 is described below

commit 3f407e96f4edf05bacc2a135811c063524cc50e4
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 8 18:32:17 2025 -0700

    [GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs, bounds, 
total_bounds (#2056)
    
    * Implement bounds, total_bounds, estimate_utm_crs
    
    * Compute minx, miny, maxx, maxy as a dataframe directly
---
 python/sedona/geopandas/geoseries.py               | 210 ++++++++++++++++++++-
 python/tests/geopandas/test_geoseries.py           |  68 ++++++-
 .../tests/geopandas/test_match_geopandas_series.py |  22 ++-
 3 files changed, 293 insertions(+), 7 deletions(-)

diff --git a/python/sedona/geopandas/geoseries.py 
b/python/sedona/geopandas/geoseries.py
index de5927c618..2e4dc6bb2a 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -51,8 +51,7 @@ class GeoSeries(GeoFrame, pspd.Series):
     """
 
     def __getitem__(self, key: Any) -> Any:
-        # Implementation of the abstract method
-        raise NotImplementedError("This method is not implemented yet.")
+        return pspd.Series.__getitem__(self, key)
 
     def __repr__(self) -> str:
         """
@@ -2022,10 +2021,211 @@ class GeoSeries(GeoFrame, pspd.Series):
             "",
         )
 
-    def estimate_utm_crs(self, datum_name: str = "WGS 84"):
-        raise NotImplementedError(
-            "GeoSeries.estimate_utm_crs() is not implemented yet."
+    @property
+    def bounds(self) -> pspd.DataFrame:
+        """Returns a ``DataFrame`` with columns ``minx``, ``miny``, ``maxx``,
+        ``maxy`` values containing the bounds for each geometry.
+
+        See ``GeoSeries.total_bounds`` for the limits of the entire series.
+
+        Examples
+        --------
+        >>> from shapely.geometry import Point, Polygon, LineString
+        >>> d = {'geometry': [Point(2, 1), Polygon([(0, 0), (1, 1), (1, 0)]),
+        ... LineString([(0, 1), (1, 2)])]}
+        >>> gdf = geopandas.GeoDataFrame(d, crs="EPSG:4326")
+        >>> gdf.bounds
+           minx  miny  maxx  maxy
+        0   2.0   1.0   2.0   1.0
+        1   0.0   0.0   1.0   1.0
+        2   0.0   1.0   1.0   2.0
+
+        You can assign the bounds to the ``GeoDataFrame`` as:
+
+        >>> import pandas as pd
+        >>> gdf = pd.concat([gdf, gdf.bounds], axis=1)
+        >>> gdf
+                                geometry  minx  miny  maxx  maxy
+        0                     POINT (2 1)   2.0   1.0   2.0   1.0
+        1  POLYGON ((0 0, 1 1, 1 0, 0 0))   0.0   0.0   1.0   1.0
+        2           LINESTRING (0 1, 1 2)   0.0   1.0   1.0   2.0
+        """
+        col = self.get_first_geometry_column()
+
+        selects = [
+            f"ST_XMin(`{col}`) as minx",
+            f"ST_YMin(`{col}`) as miny",
+            f"ST_XMax(`{col}`) as maxx",
+            f"ST_YMax(`{col}`) as maxy",
+        ]
+
+        df = self._internal.spark_frame
+
+        data_type = df.schema[col].dataType
+
+        if isinstance(data_type, BinaryType):
+            selects = [
+                select.replace(f"`{col}`", f"ST_GeomFromWKB(`{col}`)")
+                for select in selects
+            ]
+
+        sdf = df.selectExpr(*selects)
+        internal = InternalFrame(
+            spark_frame=sdf,
+            index_spark_columns=None,
+            column_labels=[("minx",), ("miny",), ("maxx",), ("maxy",)],
+            data_spark_columns=[
+                scol_for(sdf, "minx"),
+                scol_for(sdf, "miny"),
+                scol_for(sdf, "maxx"),
+                scol_for(sdf, "maxy"),
+            ],
+            column_label_names=None,
         )
+        return pspd.DataFrame(internal)
+
+    @property
+    def total_bounds(self):
+        """Returns a tuple containing ``minx``, ``miny``, ``maxx``, ``maxy``
+        values for the bounds of the series as a whole.
+
+        See ``GeoSeries.bounds`` for the bounds of the geometries contained in
+        the series.
+
+        Examples
+        --------
+        >>> from shapely.geometry import Point, Polygon, LineString
+        >>> d = {'geometry': [Point(3, -1), Polygon([(0, 0), (1, 1), (1, 0)]),
+        ... LineString([(0, 1), (1, 2)])]}
+        >>> gdf = geopandas.GeoDataFrame(d, crs="EPSG:4326")
+        >>> gdf.total_bounds
+        array([ 0., -1.,  3.,  2.])
+        """
+        import numpy as np
+        import warnings
+        from pyspark.sql import functions as F
+
+        if len(self) == 0:
+            # numpy 'min' cannot handle empty arrays
+            # TODO with numpy >= 1.15, the 'initial' argument can be used
+            return np.array([np.nan, np.nan, np.nan, np.nan])
+        ps_df = self.bounds
+        with warnings.catch_warnings():
+            # if all rows are empty geometry / none, nan is expected
+            warnings.filterwarnings(
+                "ignore", r"All-NaN slice encountered", RuntimeWarning
+            )
+            total_bounds_df = ps_df.agg(
+                {
+                    "minx": ["min"],
+                    "miny": ["min"],
+                    "maxx": ["max"],
+                    "maxy": ["max"],
+                }
+            )
+
+            return np.array(
+                (
+                    np.nanmin(total_bounds_df["minx"]["min"]),  # minx
+                    np.nanmin(total_bounds_df["miny"]["min"]),  # miny
+                    np.nanmax(total_bounds_df["maxx"]["max"]),  # maxx
+                    np.nanmax(total_bounds_df["maxy"]["max"]),  # maxy
+                )
+            )
+
+    def estimate_utm_crs(self, datum_name: str = "WGS 84") -> "CRS":
+        """Returns the estimated UTM CRS based on the bounds of the dataset.
+
+        .. versionadded:: 0.9
+
+        .. note:: Requires pyproj 3+
+
+        Parameters
+        ----------
+        datum_name : str, optional
+            The name of the datum to use in the query. Default is WGS 84.
+
+        Returns
+        -------
+        pyproj.CRS
+
+        Examples
+        --------
+        >>> import geodatasets
+        >>> df = geopandas.read_file(
+        ...     geodatasets.get_path("geoda.chicago_commpop")
+        ... )
+        >>> df.geometry.values.estimate_utm_crs()  # doctest: +SKIP
+        <Derived Projected CRS: EPSG:32616>
+        Name: WGS 84 / UTM zone 16N
+        Axis Info [cartesian]:
+        - E[east]: Easting (metre)
+        - N[north]: Northing (metre)
+        Area of Use:
+        - name: Between 90°W and 84°W, northern hemisphere between equator and 
84°N,...
+        - bounds: (-90.0, 0.0, -84.0, 84.0)
+        Coordinate Operation:
+        - name: UTM zone 16N
+        - method: Transverse Mercator
+        Datum: World Geodetic System 1984 ensemble
+        - Ellipsoid: WGS 84
+        - Prime Meridian: Greenwich
+        """
+        import numpy as np
+        from pyproj import CRS
+        from pyproj.aoi import AreaOfInterest
+        from pyproj.database import query_utm_crs_info
+
+        # This implementation replicates the implementation in geopandas's 
implementation exactly.
+        # https://github.com/geopandas/geopandas/blob/main/geopandas/array.py
+        # The only difference is that we use Sedona's total_bounds property 
which is more efficient and scalable
+        # than the geopandas implementation. The rest of the implementation 
always executes on 4 points (minx, miny, maxx, maxy),
+        # so the numpy and pyproj implementations are reasonable.
+
+        if not self.crs:
+            raise RuntimeError("crs must be set to estimate UTM CRS.")
+
+        minx, miny, maxx, maxy = self.total_bounds
+        if self.crs.is_geographic:
+            x_center = np.mean([minx, maxx])
+            y_center = np.mean([miny, maxy])
+        # ensure using geographic coordinates
+        else:
+            from pyproj import Transformer
+            from functools import lru_cache
+
+            TransformerFromCRS = lru_cache(Transformer.from_crs)
+
+            transformer = TransformerFromCRS(self.crs, "EPSG:4326", 
always_xy=True)
+            minx, miny, maxx, maxy = transformer.transform_bounds(
+                minx, miny, maxx, maxy
+            )
+            y_center = np.mean([miny, maxy])
+            # crossed the antimeridian
+            if minx > maxx:
+                # shift maxx from [-180,180] to [0,360]
+                # so both numbers are positive for center calculation
+                # Example: -175 to 185
+                maxx += 360
+                x_center = np.mean([minx, maxx])
+                # shift back to [-180,180]
+                x_center = ((x_center + 180) % 360) - 180
+            else:
+                x_center = np.mean([minx, maxx])
+
+        utm_crs_list = query_utm_crs_info(
+            datum_name=datum_name,
+            area_of_interest=AreaOfInterest(
+                west_lon_degree=x_center,
+                south_lat_degree=y_center,
+                east_lon_degree=x_center,
+                north_lat_degree=y_center,
+            ),
+        )
+        try:
+            return CRS.from_epsg(utm_crs_list[0].code)
+        except IndexError:
+            raise RuntimeError("Unable to determine UTM CRS")
 
     def to_json(
         self,
diff --git a/python/tests/geopandas/test_geoseries.py 
b/python/tests/geopandas/test_geoseries.py
index e9caad3904..0f89d547ab 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -211,8 +211,74 @@ class TestGeoSeries(TestBase):
         )
         self.check_sgpd_equals_gpd(result, expected)
 
+    def test_bounds(self):
+        d = [
+            Point(2, 1),
+            Polygon([(0, 0), (1, 1), (1, 0)]),
+            LineString([(0, 1), (1, 2)]),
+            None,
+        ]
+        geoseries = sgpd.GeoSeries(d, crs="EPSG:4326")
+        result = geoseries.bounds
+
+        expected = pd.DataFrame(
+            {
+                "minx": [2.0, 0.0, 0.0, np.nan],
+                "miny": [1.0, 0.0, 1.0, np.nan],
+                "maxx": [2.0, 1.0, 1.0, np.nan],
+                "maxy": [1.0, 1.0, 2.0, np.nan],
+            }
+        )
+        pd.testing.assert_frame_equal(result.to_pandas(), expected)
+
+    def test_total_bounds(self):
+        d = [
+            Point(3, -1),
+            Polygon([(0, 0), (1, 1), (1, 0)]),
+            LineString([(0, 1), (1, 2)]),
+            None,
+        ]
+        geoseries = sgpd.GeoSeries(d, crs="EPSG:4326")
+        result = geoseries.total_bounds
+        expected = np.array([0.0, -1.0, 3.0, 2.0])
+        np.testing.assert_array_equal(result, expected)
+
+    # These tests were taken directly from the TestEstimateUtmCrs class in the 
geopandas test suite
+    # 
https://github.com/geopandas/geopandas/blob/main/geopandas/tests/test_array.py
     def test_estimate_utm_crs(self):
-        pass
+        from pyproj import CRS
+
+        # setup
+        esb = Point(-73.9847, 40.7484)
+        sol = Point(-74.0446, 40.6893)
+        landmarks = sgpd.GeoSeries([esb, sol], crs="epsg:4326")
+
+        # geographic
+        assert landmarks.estimate_utm_crs() == CRS("EPSG:32618")
+        assert landmarks.estimate_utm_crs("NAD83") == CRS("EPSG:26918")
+
+        # projected
+        assert landmarks.to_crs("EPSG:3857").estimate_utm_crs() == 
CRS("EPSG:32618")
+
+        # antimeridian
+        antimeridian = sgpd.GeoSeries(
+            [
+                Point(1722483.900174921, 5228058.6143420935),
+                Point(4624385.494808555, 8692574.544944234),
+            ],
+            crs="EPSG:3851",
+        )
+        assert antimeridian.estimate_utm_crs() == CRS("EPSG:32760")
+
+        # out of bounds
+        with pytest.raises(RuntimeError, match="Unable to determine UTM CRS"):
+            sgpd.GeoSeries(
+                [Polygon([(0, 90), (1, 90), (2, 90)])], crs="EPSG:4326"
+            ).estimate_utm_crs()
+
+        # missing crs
+        with pytest.raises(RuntimeError, match="crs must be set"):
+            sgpd.GeoSeries([Polygon([(0, 90), (1, 90), (2, 
90)])]).estimate_utm_crs()
 
     def test_to_json(self):
         pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py 
b/python/tests/geopandas/test_match_geopandas_series.py
index 3c159a7df9..e63d086c83 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -322,8 +322,28 @@ class TestMatchGeopandasSeries(TestBase):
             gpd_result = gpd_result.to_crs(epsg=3857)
             self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
 
+    def test_bounds(self):
+        for _, geom in self.geoms:
+            sgpd_result = GeoSeries(geom).bounds
+            gpd_result = gpd.GeoSeries(geom).bounds
+            pd.testing.assert_frame_equal(
+                sgpd_result.to_pandas(), pd.DataFrame(gpd_result)
+            )
+
+    def test_total_bounds(self):
+        import numpy as np
+
+        for _, geom in self.geoms:
+            sgpd_result = GeoSeries(geom).total_bounds
+            gpd_result = gpd.GeoSeries(geom).total_bounds
+            np.testing.assert_array_equal(sgpd_result, gpd_result)
+
     def test_estimate_utm_crs(self):
-        pass
+        for crs in ["epsg:4326", "epsg:3857"]:
+            for _, geom in self.geoms:
+                gpd_result = gpd.GeoSeries(geom, crs=crs).estimate_utm_crs()
+                sgpd_result = GeoSeries(geom, crs=crs).estimate_utm_crs()
+                assert sgpd_result == gpd_result
 
     def test_to_json(self):
         pass

(sedona) branch master updated: [GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs, bounds, total_bounds (#2056)

Reply via email to