This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 3f407e96f4 [GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs,
bounds, total_bounds (#2056)
3f407e96f4 is described below
commit 3f407e96f4edf05bacc2a135811c063524cc50e4
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 8 18:32:17 2025 -0700
[GH-2055] Geopandas.GeoSeries: Implement estimate_utm_crs, bounds,
total_bounds (#2056)
* Implement bounds, total_bounds, estimate_utm_crs
* Compute minx, miny, maxx, maxy as a dataframe directly
---
python/sedona/geopandas/geoseries.py | 210 ++++++++++++++++++++-
python/tests/geopandas/test_geoseries.py | 68 ++++++-
.../tests/geopandas/test_match_geopandas_series.py | 22 ++-
3 files changed, 293 insertions(+), 7 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index de5927c618..2e4dc6bb2a 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -51,8 +51,7 @@ class GeoSeries(GeoFrame, pspd.Series):
"""
def __getitem__(self, key: Any) -> Any:
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
+ return pspd.Series.__getitem__(self, key)
def __repr__(self) -> str:
"""
@@ -2022,10 +2021,211 @@ class GeoSeries(GeoFrame, pspd.Series):
"",
)
- def estimate_utm_crs(self, datum_name: str = "WGS 84"):
- raise NotImplementedError(
- "GeoSeries.estimate_utm_crs() is not implemented yet."
+ @property
+ def bounds(self) -> pspd.DataFrame:
+ """Returns a ``DataFrame`` with columns ``minx``, ``miny``, ``maxx``,
+ ``maxy`` values containing the bounds for each geometry.
+
+ See ``GeoSeries.total_bounds`` for the limits of the entire series.
+
+ Examples
+ --------
+ >>> from shapely.geometry import Point, Polygon, LineString
+ >>> d = {'geometry': [Point(2, 1), Polygon([(0, 0), (1, 1), (1, 0)]),
+ ... LineString([(0, 1), (1, 2)])]}
+ >>> gdf = geopandas.GeoDataFrame(d, crs="EPSG:4326")
+ >>> gdf.bounds
+ minx miny maxx maxy
+ 0 2.0 1.0 2.0 1.0
+ 1 0.0 0.0 1.0 1.0
+ 2 0.0 1.0 1.0 2.0
+
+ You can assign the bounds to the ``GeoDataFrame`` as:
+
+ >>> import pandas as pd
+ >>> gdf = pd.concat([gdf, gdf.bounds], axis=1)
+ >>> gdf
+ geometry minx miny maxx maxy
+ 0 POINT (2 1) 2.0 1.0 2.0 1.0
+ 1 POLYGON ((0 0, 1 1, 1 0, 0 0)) 0.0 0.0 1.0 1.0
+ 2 LINESTRING (0 1, 1 2) 0.0 1.0 1.0 2.0
+ """
+ col = self.get_first_geometry_column()
+
+ selects = [
+ f"ST_XMin(`{col}`) as minx",
+ f"ST_YMin(`{col}`) as miny",
+ f"ST_XMax(`{col}`) as maxx",
+ f"ST_YMax(`{col}`) as maxy",
+ ]
+
+ df = self._internal.spark_frame
+
+ data_type = df.schema[col].dataType
+
+ if isinstance(data_type, BinaryType):
+ selects = [
+ select.replace(f"`{col}`", f"ST_GeomFromWKB(`{col}`)")
+ for select in selects
+ ]
+
+ sdf = df.selectExpr(*selects)
+ internal = InternalFrame(
+ spark_frame=sdf,
+ index_spark_columns=None,
+ column_labels=[("minx",), ("miny",), ("maxx",), ("maxy",)],
+ data_spark_columns=[
+ scol_for(sdf, "minx"),
+ scol_for(sdf, "miny"),
+ scol_for(sdf, "maxx"),
+ scol_for(sdf, "maxy"),
+ ],
+ column_label_names=None,
)
+ return pspd.DataFrame(internal)
+
+ @property
+ def total_bounds(self):
+ """Returns a tuple containing ``minx``, ``miny``, ``maxx``, ``maxy``
+ values for the bounds of the series as a whole.
+
+ See ``GeoSeries.bounds`` for the bounds of the geometries contained in
+ the series.
+
+ Examples
+ --------
+ >>> from shapely.geometry import Point, Polygon, LineString
+ >>> d = {'geometry': [Point(3, -1), Polygon([(0, 0), (1, 1), (1, 0)]),
+ ... LineString([(0, 1), (1, 2)])]}
+ >>> gdf = geopandas.GeoDataFrame(d, crs="EPSG:4326")
+ >>> gdf.total_bounds
+ array([ 0., -1., 3., 2.])
+ """
+ import numpy as np
+ import warnings
+ from pyspark.sql import functions as F
+
+ if len(self) == 0:
+ # numpy 'min' cannot handle empty arrays
+ # TODO with numpy >= 1.15, the 'initial' argument can be used
+ return np.array([np.nan, np.nan, np.nan, np.nan])
+ ps_df = self.bounds
+ with warnings.catch_warnings():
+ # if all rows are empty geometry / none, nan is expected
+ warnings.filterwarnings(
+ "ignore", r"All-NaN slice encountered", RuntimeWarning
+ )
+ total_bounds_df = ps_df.agg(
+ {
+ "minx": ["min"],
+ "miny": ["min"],
+ "maxx": ["max"],
+ "maxy": ["max"],
+ }
+ )
+
+ return np.array(
+ (
+ np.nanmin(total_bounds_df["minx"]["min"]), # minx
+ np.nanmin(total_bounds_df["miny"]["min"]), # miny
+ np.nanmax(total_bounds_df["maxx"]["max"]), # maxx
+ np.nanmax(total_bounds_df["maxy"]["max"]), # maxy
+ )
+ )
+
+ def estimate_utm_crs(self, datum_name: str = "WGS 84") -> "CRS":
+ """Returns the estimated UTM CRS based on the bounds of the dataset.
+
+ .. versionadded:: 0.9
+
+ .. note:: Requires pyproj 3+
+
+ Parameters
+ ----------
+ datum_name : str, optional
+ The name of the datum to use in the query. Default is WGS 84.
+
+ Returns
+ -------
+ pyproj.CRS
+
+ Examples
+ --------
+ >>> import geodatasets
+ >>> df = geopandas.read_file(
+ ... geodatasets.get_path("geoda.chicago_commpop")
+ ... )
+ >>> df.geometry.values.estimate_utm_crs() # doctest: +SKIP
+ <Derived Projected CRS: EPSG:32616>
+ Name: WGS 84 / UTM zone 16N
+ Axis Info [cartesian]:
+ - E[east]: Easting (metre)
+ - N[north]: Northing (metre)
+ Area of Use:
+ - name: Between 90°W and 84°W, northern hemisphere between equator and
84°N,...
+ - bounds: (-90.0, 0.0, -84.0, 84.0)
+ Coordinate Operation:
+ - name: UTM zone 16N
+ - method: Transverse Mercator
+ Datum: World Geodetic System 1984 ensemble
+ - Ellipsoid: WGS 84
+ - Prime Meridian: Greenwich
+ """
+ import numpy as np
+ from pyproj import CRS
+ from pyproj.aoi import AreaOfInterest
+ from pyproj.database import query_utm_crs_info
+
+ # This implementation replicates the implementation in geopandas's
implementation exactly.
+ # https://github.com/geopandas/geopandas/blob/main/geopandas/array.py
+ # The only difference is that we use Sedona's total_bounds property
which is more efficient and scalable
+ # than the geopandas implementation. The rest of the implementation
always executes on 4 points (minx, miny, maxx, maxy),
+ # so the numpy and pyproj implementations are reasonable.
+
+ if not self.crs:
+ raise RuntimeError("crs must be set to estimate UTM CRS.")
+
+ minx, miny, maxx, maxy = self.total_bounds
+ if self.crs.is_geographic:
+ x_center = np.mean([minx, maxx])
+ y_center = np.mean([miny, maxy])
+ # ensure using geographic coordinates
+ else:
+ from pyproj import Transformer
+ from functools import lru_cache
+
+ TransformerFromCRS = lru_cache(Transformer.from_crs)
+
+ transformer = TransformerFromCRS(self.crs, "EPSG:4326",
always_xy=True)
+ minx, miny, maxx, maxy = transformer.transform_bounds(
+ minx, miny, maxx, maxy
+ )
+ y_center = np.mean([miny, maxy])
+ # crossed the antimeridian
+ if minx > maxx:
+ # shift maxx from [-180,180] to [0,360]
+ # so both numbers are positive for center calculation
+ # Example: -175 to 185
+ maxx += 360
+ x_center = np.mean([minx, maxx])
+ # shift back to [-180,180]
+ x_center = ((x_center + 180) % 360) - 180
+ else:
+ x_center = np.mean([minx, maxx])
+
+ utm_crs_list = query_utm_crs_info(
+ datum_name=datum_name,
+ area_of_interest=AreaOfInterest(
+ west_lon_degree=x_center,
+ south_lat_degree=y_center,
+ east_lon_degree=x_center,
+ north_lat_degree=y_center,
+ ),
+ )
+ try:
+ return CRS.from_epsg(utm_crs_list[0].code)
+ except IndexError:
+ raise RuntimeError("Unable to determine UTM CRS")
def to_json(
self,
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index e9caad3904..0f89d547ab 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -211,8 +211,74 @@ class TestGeoSeries(TestBase):
)
self.check_sgpd_equals_gpd(result, expected)
+ def test_bounds(self):
+ d = [
+ Point(2, 1),
+ Polygon([(0, 0), (1, 1), (1, 0)]),
+ LineString([(0, 1), (1, 2)]),
+ None,
+ ]
+ geoseries = sgpd.GeoSeries(d, crs="EPSG:4326")
+ result = geoseries.bounds
+
+ expected = pd.DataFrame(
+ {
+ "minx": [2.0, 0.0, 0.0, np.nan],
+ "miny": [1.0, 0.0, 1.0, np.nan],
+ "maxx": [2.0, 1.0, 1.0, np.nan],
+ "maxy": [1.0, 1.0, 2.0, np.nan],
+ }
+ )
+ pd.testing.assert_frame_equal(result.to_pandas(), expected)
+
+ def test_total_bounds(self):
+ d = [
+ Point(3, -1),
+ Polygon([(0, 0), (1, 1), (1, 0)]),
+ LineString([(0, 1), (1, 2)]),
+ None,
+ ]
+ geoseries = sgpd.GeoSeries(d, crs="EPSG:4326")
+ result = geoseries.total_bounds
+ expected = np.array([0.0, -1.0, 3.0, 2.0])
+ np.testing.assert_array_equal(result, expected)
+
+ # These tests were taken directly from the TestEstimateUtmCrs class in the
geopandas test suite
+ #
https://github.com/geopandas/geopandas/blob/main/geopandas/tests/test_array.py
def test_estimate_utm_crs(self):
- pass
+ from pyproj import CRS
+
+ # setup
+ esb = Point(-73.9847, 40.7484)
+ sol = Point(-74.0446, 40.6893)
+ landmarks = sgpd.GeoSeries([esb, sol], crs="epsg:4326")
+
+ # geographic
+ assert landmarks.estimate_utm_crs() == CRS("EPSG:32618")
+ assert landmarks.estimate_utm_crs("NAD83") == CRS("EPSG:26918")
+
+ # projected
+ assert landmarks.to_crs("EPSG:3857").estimate_utm_crs() ==
CRS("EPSG:32618")
+
+ # antimeridian
+ antimeridian = sgpd.GeoSeries(
+ [
+ Point(1722483.900174921, 5228058.6143420935),
+ Point(4624385.494808555, 8692574.544944234),
+ ],
+ crs="EPSG:3851",
+ )
+ assert antimeridian.estimate_utm_crs() == CRS("EPSG:32760")
+
+ # out of bounds
+ with pytest.raises(RuntimeError, match="Unable to determine UTM CRS"):
+ sgpd.GeoSeries(
+ [Polygon([(0, 90), (1, 90), (2, 90)])], crs="EPSG:4326"
+ ).estimate_utm_crs()
+
+ # missing crs
+ with pytest.raises(RuntimeError, match="crs must be set"):
+ sgpd.GeoSeries([Polygon([(0, 90), (1, 90), (2,
90)])]).estimate_utm_crs()
def test_to_json(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py
b/python/tests/geopandas/test_match_geopandas_series.py
index 3c159a7df9..e63d086c83 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -322,8 +322,28 @@ class TestMatchGeopandasSeries(TestBase):
gpd_result = gpd_result.to_crs(epsg=3857)
self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+ def test_bounds(self):
+ for _, geom in self.geoms:
+ sgpd_result = GeoSeries(geom).bounds
+ gpd_result = gpd.GeoSeries(geom).bounds
+ pd.testing.assert_frame_equal(
+ sgpd_result.to_pandas(), pd.DataFrame(gpd_result)
+ )
+
+ def test_total_bounds(self):
+ import numpy as np
+
+ for _, geom in self.geoms:
+ sgpd_result = GeoSeries(geom).total_bounds
+ gpd_result = gpd.GeoSeries(geom).total_bounds
+ np.testing.assert_array_equal(sgpd_result, gpd_result)
+
def test_estimate_utm_crs(self):
- pass
+ for crs in ["epsg:4326", "epsg:3857"]:
+ for _, geom in self.geoms:
+ gpd_result = gpd.GeoSeries(geom, crs=crs).estimate_utm_crs()
+ sgpd_result = GeoSeries(geom, crs=crs).estimate_utm_crs()
+ assert sgpd_result == gpd_result
def test_to_json(self):
pass