(sedona) branch master updated: [GH-2201] Geopandas: Optimize crs operations, getitem, and set_geometry + impl scalable getitem (#2203)

jiayu Thu, 31 Jul 2025 08:34:41 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new e87eaa9875 [GH-2201] Geopandas: Optimize crs operations, getitem, and 
set_geometry + impl scalable getitem (#2203)
e87eaa9875 is described below

commit e87eaa9875c07cf00a55319913bf02b45cfdbd6f
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jul 31 08:34:29 2025 -0700

    [GH-2201] Geopandas: Optimize crs operations, getitem, and set_geometry + 
impl scalable getitem (#2203)
    
    * Improvements to geodataframe's getitem, constructor, and set_geometry + 
add to_spark_pandas function
    
    * Add tests
    
    * Clean up
    
    * Implement crs, set_crs for GeoDataFrame correctly
    
    * Make Geoseries.set_crs lazy
    
    * Make geodataframe getitem and set_geometry lazy
    
    * Optimize geoseries get crs by using first_value to pre-filter
    
    * Set allow_override=True default in geodataframe too
    
    * Clarify comment
    
    * Turn np.nan to 0 when getting geoseries crs
    
    * Apply suggestions from code review
    
    Co-authored-by: Copilot <[email protected]>
    
    * Implement and test to_crs for geodataframe
    
    * empty commit (stac tests flaked twice in a row)
    
    ---------
    
    Co-authored-by: Copilot <[email protected]>
---
 python/sedona/geopandas/geodataframe.py     | 316 ++++++++++++++++++++--------
 python/sedona/geopandas/geoseries.py        |  80 ++++---
 python/tests/geopandas/test_geodataframe.py | 120 +++++++++++
 python/tests/geopandas/test_geoseries.py    |  22 +-
 4 files changed, 418 insertions(+), 120 deletions(-)

diff --git a/python/sedona/geopandas/geodataframe.py 
b/python/sedona/geopandas/geodataframe.py
index b2c3060c04..fa4b1bc825 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -309,55 +309,22 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         Name: value, dtype: int64
         """
 
-        # Handle column access by name
-        if isinstance(key, str):
-            # Access column directly from the spark DataFrame
-            column_name = key
-
-            # Check if column exists
-            if column_name not in self.columns:
-                raise KeyError(f"Column '{column_name}' does not exist")
-
-            # Here we are getting a ps.Series with the same underlying anchor 
(ps.Dataframe).
-            # This is important so we don't unnecessarily try to perform 
operations on different dataframes
-            ps_series: pspd.Series = pspd.DataFrame.__getitem__(self, 
column_name)
-
+        # Here we are getting a ps.Series with the same underlying anchor 
(ps.Dataframe).
+        # This is important so we don't unnecessarily try to perform 
operations on different dataframes
+        item = pspd.DataFrame.__getitem__(self, key)
+
+        if isinstance(item, pspd.DataFrame):
+            # don't specify crs=self.crs here because it might not include the 
geometry column
+            # if it does include the geometry column, we don't need to set crs 
anyways
+            return GeoDataFrame(item)
+        elif isinstance(item, pspd.Series):
+            ps_series: pspd.Series = item
             try:
-                result = sgpd.GeoSeries(ps_series)
-                not_null = ps_series[ps_series.notnull()]
-                if len(not_null) > 0:
-                    first_geom = not_null.iloc[0]
-                    srid = shapely.get_srid(first_geom)
-
-                    # Shapely objects stored in the ps.Series retain their srid
-                    # but the GeoSeries does not, so we manually re-set it here
-                    if srid > 0:
-                        result.set_crs(srid, inplace=True)
-                return result
+                return sgpd.GeoSeries(ps_series)
             except TypeError:
                 return ps_series
-
-        # Handle list of column names
-        elif isinstance(key, list) and all(isinstance(k, str) for k in key):
-            # Check if all columns exist
-            missing_cols = [k for k in key if k not in self.columns]
-            if missing_cols:
-                raise KeyError(f"Columns {missing_cols} do not exist")
-
-            # Select columns from the spark DataFrame
-            spark_df = self._internal.spark_frame.select(*key)
-            pandas_df = spark_df.toPandas()
-
-            # Return as GeoDataFrame
-            return GeoDataFrame(pandas_df)
-
-        # Handle row selection via slice or boolean indexing
         else:
-            # For now, convert to pandas first for row-based operations
-            # This could be optimized later for better performance
-            pandas_df = self._internal.spark_frame.toPandas()
-            selected_rows = pandas_df[key]
-            return GeoDataFrame(selected_rows)
+            raise Exception(f"Logical Error: Unexpected type: {type(item)}")
 
     _geometry_column_name = None
 
@@ -384,49 +351,45 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         from sedona.geopandas import GeoSeries
         from pyspark.sql import DataFrame as SparkDataFrame
 
-        if isinstance(data, GeoDataFrame):
-            data_crs = data._safe_get_crs()
-            if data_crs is not None:
-                data.crs = data_crs
-
-            super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
-
-        elif isinstance(data, GeoSeries):
-            if data.crs is None:
+        if isinstance(data, (GeoDataFrame, GeoSeries)):
+            if crs:
                 data.crs = crs
 
             # For each of these super().__init__() calls, we let pyspark 
decide which inputs are valid or not
             # instead of calling e.g assert not dtype ourselves.
             # This way, if Spark adds support later, then we inherit those 
changes naturally
             super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
+
         elif isinstance(data, (PandasOnSparkDataFrame, SparkDataFrame)):
 
             super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
         elif isinstance(data, PandasOnSparkSeries):
 
             try:
-                data = GeoSeries(data)
+                data = GeoSeries(data, crs=crs)
             except TypeError:
                 pass
 
             super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
         else:
             # below are not distributed dataframe types
-            if isinstance(data, pd.DataFrame):
-                assert index is None
-                assert dtype is None
-                assert not copy
-                # Need to convert GeoDataFrame to pd.DataFrame for below cast 
to work
-                pd_df = (
-                    pd.DataFrame(data) if isinstance(data, gpd.GeoDataFrame) 
else data
-                )
-            else:
-                pd_df = pd.DataFrame(
-                    data=data,
-                    index=index,
-                    dtype=dtype,
-                    copy=copy,
-                )
+            if isinstance(data, gpd.GeoDataFrame):
+                # We can use GeoDataFrame.active_geometry_name once we drop 
support for geopandas < 1.0.0
+                # Below is the equivalent, since active_geometry_name simply 
calls _geometry_column_name
+                if data._geometry_column_name:
+                    # Geopandas stores crs as metadata instead of inside of 
the shapely objects so we must save it and set it manually later
+                    if not crs:
+                        crs = data.crs
+                    if not geometry:
+                        geometry = data.geometry.name
+
+            pd_df = pd.DataFrame(
+                data,
+                index=index,
+                columns=columns,
+                dtype=dtype,
+                copy=copy,
+            )
 
             # Spark complains if it's left as a geometry type
             geom_type_cols = pd_df.select_dtypes(include=["geometry"]).columns
@@ -447,7 +410,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 raise ValueError(crs_mismatch_error)
 
         if geometry:
-            self.set_geometry(geometry, inplace=True)
+            self.set_geometry(geometry, inplace=True, crs=crs)
 
         if geometry is None and "geometry" in self.columns:
 
@@ -462,8 +425,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 geom_crs = geometry.crs
                 if geom_crs is None:
                     if crs is not None:
-                        geometry.set_crs(crs, inplace=True)
-                        self.set_geometry(geometry, inplace=True)
+                        self.set_geometry(geometry, inplace=True, crs=crs)
                 else:
                     if crs is not None and geom_crs != crs:
                         raise ValueError(crs_mismatch_error)
@@ -625,6 +587,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
             frame = self.copy(deep=False)
 
         geo_column_name = self._geometry_column_name
+        new_series = False
 
         if geo_column_name is None:
             geo_column_name = "geometry"
@@ -647,6 +610,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                     level = col.rename(geo_column_name)
             else:
                 level = pspd.Series(col, name=geo_column_name)
+
+            if not isinstance(level, sgpd.GeoSeries):
+                # Set the crs later, so we can allow_override=True
+                level = sgpd.GeoSeries(level)
+
+            new_series = True
         elif hasattr(col, "ndim") and col.ndim > 1:
             raise ValueError("Must pass array with one dimension only.")
         else:  # should be a colname
@@ -689,20 +658,15 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 # if not dropping, set the active geometry name to the given 
col name
                 geo_column_name = col
 
-        if not crs:
-            crs = getattr(level, "crs", None)
-
-        # Check that we are using a listlike of geometries
-        level = _ensure_geometry(level, crs=crs)
-        # ensure_geometry only sets crs on level if it has crs==None
-
         # This operation throws a warning to the user asking them to set 
pspd.set_option('compute.ops_on_diff_frames', True)
         # to allow operations on different frames. We pass these warnings on 
to the user so they must manually set it themselves.
-        if level.crs != crs:
+        if crs:
             level.set_crs(crs, inplace=True, allow_override=True)
+            new_series = True
 
         frame._geometry_column_name = geo_column_name
-        frame[geo_column_name] = level
+        if new_series:
+            frame[geo_column_name] = level
 
         if not inplace:
             return frame
@@ -824,6 +788,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
 
         return gpd.GeoDataFrame(pd_df, geometry=self._geometry_column_name)
 
+    def to_spark_pandas(self) -> pspd.DataFrame:
+        """
+        Convert the GeoDataFrame to a Spark Pandas DataFrame.
+        """
+        return pspd.DataFrame(self._internal)
+
     @property
     def sindex(self) -> SpatialIndex | None:
         """
@@ -887,10 +857,184 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
 
     @crs.setter
     def crs(self, value):
-        # Avoid trying to access the geometry column (which might be missing) 
if crs is None
-        if value is None:
-            return
-        self.geometry.crs = value
+        # Since pyspark dataframes are immutable, we can't modify in place, so 
we create the new geoseries and replace it
+        self.geometry = self.geometry.set_crs(value)
+
+    def set_crs(self, crs, inplace=False, allow_override=True):
+        """
+        Set the Coordinate Reference System (CRS) of the ``GeoDataFrame``.
+
+        If there are multiple geometry columns within the GeoDataFrame, only
+        the CRS of the active geometry column is set.
+
+        Pass ``None`` to remove CRS from the active geometry column.
+
+        Notes
+        -----
+        The underlying geometries are not transformed to this CRS. To
+        transform the geometries to a new CRS, use the ``to_crs`` method.
+
+        Parameters
+        ----------
+        crs : pyproj.CRS | None, optional
+            The value can be anything accepted
+            by :meth:`pyproj.CRS.from_user_input() 
<pyproj.crs.CRS.from_user_input>`,
+            such as an authority string (eg "EPSG:4326") or a WKT string.
+        epsg : int, optional
+            EPSG code specifying the projection.
+        inplace : bool, default False
+            If True, the CRS of the GeoDataFrame will be changed in place
+            (while still returning the result) instead of making a copy of
+            the GeoDataFrame.
+        allow_override : bool, default True
+            If the GeoDataFrame already has a CRS, allow to replace the
+            existing CRS, even when both are not equal. In Sedona, setting 
this to False
+            will lead to eager evaluation instead of lazy evaluation. Unlike 
Geopandas,
+            True is the default value in Sedona for performance reasons.
+
+        Examples
+        --------
+        >>> from sedona.geopandas import GeoDataFrame
+        >>> from shapely.geometry import Point
+        >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), 
Point(2, 1)]}
+        >>> gdf = GeoDataFrame(d)
+        >>> gdf
+            col1     geometry
+        0  name1  POINT (1 2)
+        1  name2  POINT (2 1)
+
+        Setting CRS to a GeoDataFrame without one:
+
+        >>> gdf.crs is None
+        True
+
+        >>> gdf = gdf.set_crs('epsg:3857')
+        >>> gdf.crs  # doctest: +SKIP
+        <Projected CRS: EPSG:3857>
+        Name: WGS 84 / Pseudo-Mercator
+        Axis Info [cartesian]:
+        - X[east]: Easting (metre)
+        - Y[north]: Northing (metre)
+        Area of Use:
+        - name: World - 85°S to 85°N
+        - bounds: (-180.0, -85.06, 180.0, 85.06)
+        Coordinate Operation:
+        - name: Popular Visualisation Pseudo-Mercator
+        - method: Popular Visualisation Pseudo Mercator
+        Datum: World Geodetic System 1984
+        - Ellipsoid: WGS 84
+        - Prime Meridian: Greenwich
+
+        Overriding existing CRS:
+
+        >>> gdf = gdf.set_crs(4326, allow_override=True)
+
+        Without ``allow_override=True``, ``set_crs`` returns an error if you 
try to
+        override CRS.
+
+        See Also
+        --------
+        GeoDataFrame.to_crs : re-project to another CRS
+        """
+        # Since pyspark dataframes are immutable, we can't modify in place, so 
we create the new geoseries and replace it
+        new_geometry = self.geometry.set_crs(crs, 
allow_override=allow_override)
+        if inplace:
+            self.geometry = new_geometry
+        else:
+            df = self.copy()
+            df.geometry = new_geometry
+            return df
+
+    def to_crs(
+        self,
+        crs: Any | None = None,
+        epsg: int | None = None,
+        inplace: bool = False,
+    ) -> GeoDataFrame | None:
+        """Transform geometries to a new coordinate reference system.
+
+        Transform all geometries in an active geometry column to a different 
coordinate
+        reference system.  The ``crs`` attribute on the current GeoSeries must
+        be set.  Either ``crs`` or ``epsg`` may be specified for output.
+
+        This method will transform all points in all objects. It has no notion
+        of projecting entire geometries.  All segments joining points are
+        assumed to be lines in the current projection, not geodesics. Objects
+        crossing the dateline (or other projection boundary) will have
+        undesirable behavior.
+
+        Parameters
+        ----------
+        crs : pyproj.CRS, optional if `epsg` is specified
+            The value can be anything accepted by
+            :meth:`pyproj.CRS.from_user_input() 
<pyproj.crs.CRS.from_user_input>`,
+            such as an authority string (eg "EPSG:4326") or a WKT string.
+        epsg : int, optional if `crs` is specified
+            EPSG code specifying output projection.
+        inplace : bool, optional, default: False
+            Whether to return a new GeoDataFrame or do the transformation in
+            place.
+
+        Returns
+        -------
+        GeoDataFrame
+
+        Examples
+        --------
+        >>> from shapely.geometry import Point
+        >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), 
Point(2, 1)]}
+        >>> gdf = geopandas.GeoDataFrame(d, crs=4326)
+        >>> gdf
+            col1     geometry
+        0  name1  POINT (1 2)
+        1  name2  POINT (2 1)
+        >>> gdf.crs  # doctest: +SKIP
+        <Geographic 2D CRS: EPSG:4326>
+        Name: WGS 84
+        Axis Info [ellipsoidal]:
+        - Lat[north]: Geodetic latitude (degree)
+        - Lon[east]: Geodetic longitude (degree)
+        Area of Use:
+        - name: World
+        - bounds: (-180.0, -90.0, 180.0, 90.0)
+        Datum: World Geodetic System 1984
+        - Ellipsoid: WGS 84
+        - Prime Meridian: Greenwich
+
+        >>> gdf = gdf.to_crs(3857)
+        >>> gdf
+            col1                       geometry
+        0  name1  POINT (111319.491 222684.209)
+        1  name2  POINT (222638.982 111325.143)
+        >>> gdf.crs  # doctest: +SKIP
+        <Projected CRS: EPSG:3857>
+        Name: WGS 84 / Pseudo-Mercator
+        Axis Info [cartesian]:
+        - X[east]: Easting (metre)
+        - Y[north]: Northing (metre)
+        Area of Use:
+        - name: World - 85°S to 85°N
+        - bounds: (-180.0, -85.06, 180.0, 85.06)
+        Coordinate Operation:
+        - name: Popular Visualisation Pseudo-Mercator
+        - method: Popular Visualisation Pseudo Mercator
+        Datum: World Geodetic System 1984
+        - Ellipsoid: WGS 84
+        - Prime Meridian: Greenwich
+
+        See Also
+        --------
+        GeoDataFrame.set_crs : assign CRS without re-projection
+        """
+        new_geometry = self.geometry.to_crs(crs=crs, epsg=epsg)
+        if inplace:
+            df = self
+            df.geometry = new_geometry
+            return None
+        else:
+            df = self.copy()
+            df.geometry = new_geometry
+            return df
 
     @classmethod
     def from_dict(
diff --git a/python/sedona/geopandas/geoseries.py 
b/python/sedona/geopandas/geoseries.py
index 2075895681..5f2b8dcb19 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import typing
 from typing import Any, Union, Literal, List
 
+import numpy as np
 import geopandas as gpd
 import sedona.geopandas as sgpd
 import pandas as pd
@@ -28,8 +28,8 @@ from pyspark.pandas import Series as PandasOnSparkSeries
 from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
 from pyspark.pandas.internal import InternalFrame
 from pyspark.pandas.series import first_series
-from pyspark.pandas.utils import scol_for, log_advice
-from pyspark.sql.types import BinaryType, NullType
+from pyspark.pandas.utils import scol_for
+from pyspark.sql.types import NullType
 from sedona.spark.sql.types import GeometryType
 
 from sedona.spark.sql import st_aggregates as sta
@@ -361,16 +361,18 @@ class GeoSeries(GeoFrame, pspd.Series):
             assert not copy
             assert not fastpath
 
-            data_crs = None
-            if hasattr(data, "crs"):
-                data_crs = data.crs
-            if data_crs is not None and crs is not None and data_crs != crs:
-                raise ValueError(
-                    "CRS mismatch between CRS of the passed geometries "
-                    "and 'crs'. Use 'GeoSeries.set_crs(crs, "
-                    "allow_override=True)' to overwrite CRS or "
-                    "'GeoSeries.to_crs(crs)' to reproject geometries. "
-                )
+            # We don't check crs validity to keep the operation lazy.
+            # Keep the original code for now
+            # data_crs = None
+            # if hasattr(data, "crs"):
+            #     data_crs = data.crs
+            # if data_crs is not None and crs is not None and data_crs != crs:
+            #     raise ValueError(
+            #         "CRS mismatch between CRS of the passed geometries "
+            #         "and 'crs'. Use 'GeoSeries.set_crs(crs, "
+            #         "allow_override=True)' to overwrite CRS or "
+            #         "'GeoSeries.to_crs(crs)' to reproject geometries. "
+            #     )
 
             # PySpark Pandas' ps.Series.__init__() does not support construction from a
             # ps.Series input. For now, we manually implement the logic.
@@ -463,23 +465,36 @@ class GeoSeries(GeoFrame, pspd.Series):
         if len(self) == 0:
             return None
 
-        spark_col = stf.ST_SRID(self.spark.column)
+        if parse_version(pyspark.__version__) >= parse_version("3.5.0"):
+            spark_col = stf.ST_SRID(F.first_value(self.spark.column, 
ignoreNulls=True))
+            # Set this to avoid error complaining that we don't have a groupby 
column
+            is_aggr = True
+        else:
+            spark_col = stf.ST_SRID(self.spark.column)
+            is_aggr = False
+
         tmp_series = self._query_geometry_column(
             spark_col,
             returns_geom=False,
+            is_aggr=is_aggr,
         )
 
         # All geometries should have the same srid
         # so we just take the srid of the first non-null element
-        first_idx = tmp_series.first_valid_index()
-        srid = tmp_series[first_idx] if first_idx is not None else 0
+
+        if parse_version(pyspark.__version__) >= parse_version("3.5.0"):
+            srid = tmp_series.item()
+            # Turn np.nan to 0 to avoid error
+            srid = 0 if np.isnan(srid) else srid
+        else:
+            first_idx = tmp_series.first_valid_index()
+            srid = tmp_series[first_idx] if first_idx is not None else 0
 
         # Sedona returns 0 if doesn't exist
         return CRS.from_user_input(srid) if srid != 0 else None
 
     @crs.setter
     def crs(self, value: Union["CRS", None]):
-        # Implementation of the abstract method
         self.set_crs(value, inplace=True)
 
     @typing.overload
@@ -505,7 +520,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         crs: Union[Any, None] = None,
         epsg: Union[int, None] = None,
         inplace: bool = False,
-        allow_override: bool = False,
+        allow_override: bool = True,
     ) -> Union["GeoSeries", None]:
         """
         Set the Coordinate Reference System (CRS) of a ``GeoSeries``.
@@ -529,9 +544,11 @@ class GeoSeries(GeoFrame, pspd.Series):
             If True, the CRS of the GeoSeries will be changed in place
             (while still returning the result) instead of making a copy of
             the GeoSeries.
-        allow_override : bool, default False
+        allow_override : bool, default True
             If the GeoSeries already has a CRS, allow to replace the
-            existing CRS, even when both are not equal.
+            existing CRS, even when both are not equal. In Sedona, setting 
this to False
+            will lead to eager evaluation instead of lazy evaluation. Unlike 
Geopandas,
+            True is the default value in Sedona for performance reasons.
 
         Returns
         -------
@@ -589,19 +606,18 @@ class GeoSeries(GeoFrame, pspd.Series):
         elif epsg is not None:
             crs = CRS.from_epsg(epsg)
 
-        curr_crs = self.crs
-
-        # If CRS is the same, do nothing
-        if curr_crs == crs:
-            return
+        # The below block for the not allow_override case is eager due to the 
self.crs call
+        # This hurts performance and user experience, hence the default being 
set to True in Sedona
+        if not allow_override:
+            curr_crs = self.crs
 
-        if not allow_override and curr_crs is not None and not curr_crs == crs:
-            raise ValueError(
-                "The GeoSeries already has a CRS which is not equal to the 
passed "
-                "CRS. Specify 'allow_override=True' to allow replacing the 
existing "
-                "CRS without doing any transformation. If you actually want to 
"
-                "transform the geometries, use 'GeoSeries.to_crs' instead."
-            )
+            if curr_crs is not None and not curr_crs == crs:
+                raise ValueError(
+                    "The GeoSeries already has a CRS which is not equal to the 
passed "
+                    "CRS. Specify 'allow_override=True' to allow replacing the 
existing "
+                    "CRS without doing any transformation. If you actually 
want to "
+                    "transform the geometries, use 'GeoSeries.to_crs' instead."
+                )
 
         # 0 indicates no srid in sedona
         new_epsg = crs.to_epsg() if crs else 0
diff --git a/python/tests/geopandas/test_geodataframe.py 
b/python/tests/geopandas/test_geodataframe.py
index abda5010c3..bc50673d55 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -62,6 +62,16 @@ class TestGeoDataFrame(TestGeopandasBase):
             sgpd_df = GeoDataFrame(obj)
         check_geodataframe(sgpd_df)
 
+    def test_construct_from_geopandas(self):
+        gpd_df = gpd.GeoDataFrame(
+            {"geometry1": [Point(0, 0)]}, geometry="geometry1", crs="EPSG:3857"
+        )
+        with self.ps_allow_diff_frames():
+            sgpd_df = GeoDataFrame(gpd_df)
+        assert sgpd_df.crs == "EPSG:3857"
+        assert sgpd_df.geometry.crs == "EPSG:3857"
+        assert sgpd_df.geometry.name == "geometry1"
+
     @pytest.mark.parametrize(
         "obj",
         [
@@ -126,6 +136,57 @@ class TestGeoDataFrame(TestGeopandasBase):
         sgpd_df = sgpd.GeoDataFrame(obj)
         assert_frame_equal(pd_df, sgpd_df.to_pandas())
 
+    def test_to_geopandas(self):
+        from geopandas.testing import assert_geodataframe_equal
+
+        data = {"geometry": [Point(x, x) for x in range(3)], "id": [1, 2, 3]}
+        index = [1, 2, 3]
+        crs = "EPSG:3857"
+        # TODO: try to optimize this away
+        with self.ps_allow_diff_frames():
+            result = GeoDataFrame(data, index=index, crs=crs).to_geopandas()
+        gpd_df = gpd.GeoDataFrame(data, index=index, crs=crs)
+        assert_geodataframe_equal(result, gpd_df)
+
+    def test_to_spark_pandas(self):
+        data = {"geometry": [Point(x, x) for x in range(3)], "id": [1, 2, 3]}
+        index = [1, 2, 3]
+        result = GeoDataFrame(data, index=index).to_spark_pandas()
+        ps_df = ps.DataFrame(data, index=index)
+        assert_frame_equal(result.to_pandas(), ps_df.to_pandas())
+
+    def test_getitem(self):
+        geoms = [Point(x, x) for x in range(3)]
+        ids = [1, 2, 3]
+        values = ["a", "b", "c"]
+        crs = "EPSG:3857"
+
+        with self.ps_allow_diff_frames():
+            df = GeoDataFrame({"geometry": geoms, "id": ids, "value": values}, 
crs=crs)
+
+        # get a single non-geometry series
+        result = df["id"]
+        expected = pd.Series(ids, name="id")
+        self.check_pd_series_equal(result, expected)
+
+        # get a single geometry series
+        result = df["geometry"]
+        expected = gpd.GeoSeries(geoms, name="geometry", crs=crs)
+        self.check_sgpd_equals_gpd(result, expected)
+
+        # get multiple columns
+        result = df[["id", "value"]]
+        # no crs because no geometry column
+        expected = gpd.GeoDataFrame({"id": ids, "value": values})
+        self.check_sgpd_df_equals_gpd_df(result, expected)
+
+        # get numerical slice
+        result = df[:2]
+        expected = gpd.GeoDataFrame(
+            {"geometry": geoms[:2], "id": ids[:2], "value": values[:2]}, 
crs=crs
+        )
+        self.check_sgpd_df_equals_gpd_df(result, expected)
+
     def test_plot(self):
         # Just make sure it doesn't error
         df = GeoDataFrame(
@@ -220,6 +281,50 @@ class TestGeoDataFrame(TestGeopandasBase):
         df_copy = df.copy()
         assert type(df_copy) is GeoDataFrame
 
+    def test_set_crs(self):
+        sgpd_df = sgpd.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+        with self.ps_allow_diff_frames():
+            sgpd_df.crs = 4326
+        assert sgpd_df.crs.to_epsg() == 4326
+
+        with self.ps_allow_diff_frames():
+            sgpd_df.set_crs(3857, inplace=True, allow_override=True)
+        assert sgpd_df.crs.to_epsg() == 3857
+
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_crs(None, allow_override=True)
+        assert isinstance(sgpd_df, GeoDataFrame)
+        assert sgpd_df.crs is None
+
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_crs(4326, allow_override=True)
+        assert isinstance(sgpd_df, GeoDataFrame)
+        assert sgpd_df.crs.to_epsg() == 4326
+
+    def test_to_crs(self):
+        from pyproj import CRS
+
+        with self.ps_allow_diff_frames():
+            gdf = sgpd.GeoDataFrame(
+                {"geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]}, crs=4326
+            )
+        assert isinstance(gdf.crs, CRS) and gdf.crs.to_epsg() == 4326
+
+        with self.ps_allow_diff_frames():
+            result = gdf.to_crs(3857)
+        assert isinstance(result.crs, CRS) and result.crs.to_epsg() == 3857
+
+        expected = gpd.GeoSeries(
+            [
+                Point(111319.49079327356, 111325.14286638486),
+                Point(222638.98158654712, 222684.20850554455),
+                Point(333958.4723798207, 334111.1714019597),
+            ],
+            name="geometry",
+            crs=3857,
+        )
+        self.check_sgpd_equals_gpd(result.geometry, expected)
+
     def test_set_geometry(self):
         from sedona.geopandas.geodataframe import MissingGeometryColumnError
 
@@ -279,6 +384,21 @@ class TestGeoDataFrame(TestGeopandasBase):
         assert new_df.crs == sgpd_df.crs
         assert new_df.geometry.crs == sgpd_df.crs
 
+    def test_set_geometry_crs(self):
+        df = GeoDataFrame({"geometry1": [Point(0, 0)]})
+        with self.ps_allow_diff_frames():
+            df.set_geometry("geometry1", crs="EPSG:3857", inplace=True)
+        assert df.crs == "EPSG:3857"
+        assert df.geometry.crs == "EPSG:3857"
+
+        with self.ps_allow_diff_frames():
+            df = GeoDataFrame(
+                {"geometry1": [Point(0, 0)]}, geometry="geometry1", 
crs="EPSG:3857"
+            )
+
+        assert df.crs == "EPSG:3857"
+        assert df.geometry.crs == "EPSG:3857"
+
     def test_active_geometry_name(self):
         if parse_version(gpd.__version__) < parse_version("1.0.0"):
             return
diff --git a/python/tests/geopandas/test_geoseries.py 
b/python/tests/geopandas/test_geoseries.py
index d73545f08a..5f9b439874 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -95,6 +95,24 @@ class TestGeoSeries(TestGeopandasBase):
         sgpd_series = GeoSeries(obj)
         assert isinstance(sgpd_series, sgpd.GeoSeries)
 
+    def test_to_geopandas(self):
+        from geopandas.testing import assert_geoseries_equal
+
+        data = [Point(x, x) for x in range(3)]
+        index = [1, 2, 3]
+        crs = "EPSG:3857"
+        result = GeoSeries(data, index=index, crs=crs).to_geopandas()
+        gpd_df = gpd.GeoSeries(data, index=index, crs=crs)
+        assert_geoseries_equal(result, gpd_df)
+
+    def test_to_spark_pandas(self):
+        data = [Point(x, x) for x in range(3)]
+        index = [1, 2, 3]
+        crs = "EPSG:3857"
+        result = GeoSeries(data, index=index, crs=crs).to_spark_pandas()
+        ps_df = ps.Series(data, index=index)
+        assert_series_equal(result.to_pandas(), ps_df.to_pandas())
+
     def test_plot(self):
         # Just make sure it doesn't error
         self.geoseries.plot()
@@ -1642,9 +1660,9 @@ e": "Feature", "properties": {}, "geometry": {"type": 
"Point", "coordinates": [3
         assert geo_series.crs.to_epsg() == 4326
 
         with pytest.raises(ValueError):
-            geo_series.set_crs(4328)
+            geo_series.set_crs(4328, allow_override=False)
         with pytest.raises(ValueError):
-            geo_series.crs = None
+            geo_series.set_crs(None, allow_override=False)
 
         geo_series = geo_series.set_crs(None, allow_override=True)
         assert geo_series.crs == None

(sedona) branch master updated: [GH-2201] Geopandas: Optimize crs operations, getitem, and set_geometry + impl scalable getitem (#2203)

Reply via email to