(sedona) branch master updated: [GH-2006] Geopandas: set_geometry, rename_geometry, active_geometry_name + refactor tests (#2058)

jiayu Tue, 15 Jul 2025 11:53:19 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new da82434b93 [GH-2006] Geopandas: set_geometry, rename_geometry, 
active_geometry_name + refactor tests (#2058)
da82434b93 is described below

commit da82434b9353fd1e79a32dae9e9832b92140f01a
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 15 11:52:55 2025 -0700

    [GH-2006] Geopandas: set_geometry, rename_geometry, active_geometry_name + 
refactor tests (#2058)
    
    * Implement basic set_geometry and refactor tests
    
    * Skip active_geometry_name for old versions
    
    * Implement rename_geometry and update correct version for 
active_geometry_name
    
    * Fix version skipping logic in tests
    
    * Clean up
    
    * Fix tests intersection, from_*, and set_geometry after the merge
    
    * Implement rest of logic to pass the crs tests in set_geometry
    
    * clean up
    
    * Fix some tests (all except rename_geometry)
    
    * Fixes to pass tests
    
    * Remove check for option already set
    
    * Clean up unused (commented) setup method
---
 python/sedona/geopandas/base.py                    |   5 -
 python/sedona/geopandas/geodataframe.py            | 479 ++++++++++++++++++---
 python/sedona/geopandas/geoseries.py               |  27 +-
 python/tests/geopandas/test_geodataframe.py        | 123 +++++-
 python/tests/geopandas/test_geopandas_base.py      |  98 +++++
 python/tests/geopandas/test_geoseries.py           |  17 +-
 .../geopandas/test_match_geopandas_dataframe.py    | 173 ++++++++
 .../tests/geopandas/test_match_geopandas_series.py |  41 +-
 8 files changed, 825 insertions(+), 138 deletions(-)

diff --git a/python/sedona/geopandas/base.py b/python/sedona/geopandas/base.py
index 8bc1ad4dd3..65974efbff 100644
--- a/python/sedona/geopandas/base.py
+++ b/python/sedona/geopandas/base.py
@@ -60,11 +60,6 @@ class GeoFrame(metaclass=ABCMeta):
     ) -> Union["GeoSeries", Scalar]:
         raise NotImplementedError("This method is not implemented yet.")
 
-    @property
-    @abstractmethod
-    def dtypes(self) -> Union[gpd.GeoSeries, pd.Series, Dtype]:
-        raise NotImplementedError("This method is not implemented yet.")
-
     @abstractmethod
     def to_geopandas(self) -> Union[gpd.GeoDataFrame, pd.Series]:
         raise NotImplementedError("This method is not implemented yet.")
diff --git a/python/sedona/geopandas/geodataframe.py 
b/python/sedona/geopandas/geodataframe.py
index b627d06a7d..0305153ec0 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -16,11 +16,16 @@
 # under the License.
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, Literal, Callable, Union
+import typing
 
+import warnings
+import numpy as np
+import shapely
 import geopandas as gpd
 import pandas as pd
 import pyspark.pandas as pspd
+import sedona.geopandas as sgpd
 from pyspark.pandas import Series as PandasOnSparkSeries
 from pyspark.pandas._typing import Dtype
 from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
@@ -30,6 +35,13 @@ from sedona.geopandas._typing import Label
 from sedona.geopandas.base import GeoFrame
 from sedona.geopandas.sindex import SpatialIndex
 
+from pandas.api.extensions import register_extension_dtype
+from geopandas.geodataframe import crs_mismatch_error
+from geopandas.array import GeometryDtype
+from shapely.geometry.base import BaseGeometry
+
+register_extension_dtype(GeometryDtype)
+
 
 class GeoDataFrame(GeoFrame, pspd.DataFrame):
     """
@@ -64,7 +76,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         1    2
         Name: value, dtype: int64
         """
-        from sedona.geopandas import GeoSeries
 
         # Handle column access by name
         if isinstance(key, str):
@@ -75,27 +86,15 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
             if column_name not in self.columns:
                 raise KeyError(f"Column '{column_name}' does not exist")
 
-            # Get column data from spark_frame
-            spark_df = self._internal.spark_frame.select(column_name)
-            pandas_df = spark_df.toPandas()
-
-            # Check if this is a geometry column
-            field = next(
-                (f for f in self._internal.spark_frame.schema.fields if f.name 
== key),
-                None,
-            )
-
-            if field and (
-                field.dataType.typeName() == "geometrytype"
-                or field.dataType.typeName() == "binary"
-            ):
-                # Return as GeoSeries for geometry columns
-                return GeoSeries(pandas_df[column_name])
-            else:
-                # Return as regular pandas Series for non-geometry columns
-                from pyspark.pandas import Series
+            # Here we are getting a ps.Series with the same underlying anchor 
(ps.Dataframe).
+            # This is important so we don't unnecessarily try to perform 
operations on different dataframes
+            ps_series = pspd.DataFrame.__getitem__(self, column_name)
 
-                return Series(pandas_df[column_name])
+            try:
+                result = sgpd.GeoSeries(ps_series)
+                return result
+            except:
+                return ps_series
 
         # Handle list of column names
         elif isinstance(key, list) and all(isinstance(k, str) for k in key):
@@ -119,6 +118,8 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
             selected_rows = pandas_df[key]
             return GeoDataFrame(selected_rows)
 
+    _geometry_column_name = None
+
     def __init__(
         self,
         data=None,
@@ -138,19 +139,65 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         from sedona.geopandas import GeoSeries
         from pyspark.sql import DataFrame as SparkDataFrame
 
-        if isinstance(data, (GeoDataFrame, GeoSeries)):
-            assert dtype is None
-            assert not copy
-            super().__init__(data, index=index, dtype=dtype, copy=copy)
-        elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)):
-            assert columns is None
-            assert dtype is None
-            assert not copy
-            super().__init__(data, index=index, dtype=dtype)
+        # Simplified version of the function from GeoSeries.__init__()
+        def try_geom_to_ewkb(x) -> bytes:
+            if isinstance(x, BaseGeometry):
+                kwargs = {}
+                if crs:
+                    from pyproj import CRS
+
+                    srid = CRS.from_user_input(crs)
+                    kwargs["srid"] = srid.to_epsg()
+
+                return shapely.wkb.dumps(x, **kwargs)
+            elif isinstance(x, bytearray):
+                return bytes(x)
+            elif x is None or isinstance(x, bytes):
+                return x
+            else:
+                raise TypeError(f"expected geometry or bytes, got {type(x)}: 
{x}")
+
+        def safe_apply_to_each_column(
+            df: pd.DataFrame | pspd.DataFrame, func: Callable
+        ):
+            # try except won't work here for ps.DataFrame case, so we use 
first_valid_index() instead
+            for col in df.columns:
+                first_idx = df[col].first_valid_index()
+                if first_idx is not None and isinstance(
+                    df[col][first_idx], BaseGeometry
+                ):
+                    df[col] = df[col].apply(try_geom_to_ewkb)
+            return df
+
+        if isinstance(data, GeoDataFrame):
+            if data._safe_get_crs() is None:
+                data.crs = crs
+        elif isinstance(data, GeoSeries):
+            if data.crs is None:
+                data.crs = crs
+
+            # For each of these super().__init__() calls, we let pyspark 
decide which inputs are valid or not
+            # instead of calling e.g assert not dtype ourselves.
+            # This way, if Spark adds support later, then we inherit those 
changes naturally
+            super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
+        elif isinstance(data, PandasOnSparkDataFrame):
+
+            data = safe_apply_to_each_column(data, try_geom_to_ewkb)
+
+            super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
+        elif isinstance(data, PandasOnSparkSeries):
+
+            try:
+                data = GeoSeries(data)
+            except TypeError:
+                pass
+
+            super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
         elif isinstance(data, SparkDataFrame):
             assert columns is None
             assert dtype is None
             assert not copy
+
             if index is None:
                 internal = InternalFrame(spark_frame=data, 
index_spark_columns=None)
                 object.__setattr__(self, "_internal_frame", internal)
@@ -170,15 +217,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 )
             gdf = gpd.GeoDataFrame(df)
             # convert each geometry column to wkb type
-            import shapely
 
-            for col in gdf.columns:
-                # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, 
etc of shapely.Geometry objects.
-                if len(gdf[col]) > 0 and isinstance(
-                    gdf[col].iloc[0], shapely.geometry.base.BaseGeometry
-                ):
-                    gdf[col] = gdf[col].apply(lambda geom: geom.wkb)
+            # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc 
of shapely.Geometry objects.
+            gdf = safe_apply_to_each_column(gdf, try_geom_to_ewkb)
+
             pdf = pd.DataFrame(gdf)
+
             # initialize the parent class pyspark Dataframe with the pandas 
Series
             super().__init__(
                 data=pdf,
@@ -187,6 +231,315 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 copy=copy,
             )
 
+        if isinstance(data, (GeoDataFrame, gpd.GeoDataFrame)):
+            self._geometry_column_name = data._geometry_column_name
+            if crs is not None and data.crs != crs:
+                raise ValueError(crs_mismatch_error)
+
+    def _get_geometry(self) -> sgpd.GeoSeries:
+        if self._geometry_column_name not in self:
+            if self._geometry_column_name is None:
+                msg = (
+                    "You are calling a geospatial method on the GeoDataFrame, "
+                    "but the active geometry column to use has not been set. "
+                )
+            else:
+                msg = (
+                    "You are calling a geospatial method on the GeoDataFrame, "
+                    f"but the active geometry column 
('{self._geometry_column_name}') "
+                    "is not present. "
+                )
+            geo_cols = list(self.columns[self.dtypes == "geometry"])
+            if len(geo_cols) > 0:
+                msg += (
+                    f"\nThere are columns with geometry data type 
({geo_cols}), and "
+                    "you can either set one as the active geometry with "
+                    'df.set_geometry("name") or access the column as a '
+                    'GeoSeries (df["name"]) and call the method directly on 
it.'
+                )
+            else:
+                msg += (
+                    "\nThere are no existing columns with geometry data type. 
You can "
+                    "add a geometry column as the active geometry column with "
+                    "df.set_geometry. "
+                )
+
+            raise AttributeError(msg)
+        return self[self._geometry_column_name]
+
+    def _set_geometry(self, col):
+        # This check is included in the original geopandas. Note that this 
prevents assigning a str to the property
+        # e.g. df.geometry = "geometry"
+        # However, the user can still specify a str in the public 
.set_geometry() method
+        # ie. df.geometry = "geometry1" errors, but 
df.set_geometry("geometry1") works
+        if not pd.api.types.is_list_like(col):
+            raise ValueError("Must use a list-like to set the geometry 
property")
+        self.set_geometry(col, inplace=True)
+
+    geometry = property(
+        fget=_get_geometry, fset=_set_geometry, doc="Geometry data for 
GeoDataFrame"
+    )
+
+    @typing.overload
+    def set_geometry(
+        self,
+        col,
+        drop: bool | None = ...,
+        inplace: Literal[True] = ...,
+        crs: Any | None = ...,
+    ) -> None: ...
+
+    @typing.overload
+    def set_geometry(
+        self,
+        col,
+        drop: bool | None = ...,
+        inplace: Literal[False] = ...,
+        crs: Any | None = ...,
+    ) -> GeoDataFrame: ...
+
+    def set_geometry(
+        self,
+        col,
+        drop: bool | None = None,
+        inplace: bool = False,
+        crs: Any | None = None,
+    ) -> GeoDataFrame | None:
+        """
+        Set the GeoDataFrame geometry using either an existing column or
+        the specified input. By default yields a new object.
+
+        The original geometry column is replaced with the input.
+
+        Parameters
+        ----------
+        col : column label or array-like
+            An existing column name or values to set as the new geometry 
column.
+            If values (array-like, (Geo)Series) are passed, then if they are 
named
+            (Series) the new geometry column will have the corresponding name,
+            otherwise the existing geometry column will be replaced. If there 
is
+            no existing geometry column, the new geometry column will use the
+            default name "geometry".
+        drop : boolean, default False
+            When specifying a named Series or an existing column name for 
`col`,
+            controls if the previous geometry column should be dropped from the
+            result. The default of False keeps both the old and new geometry 
column.
+
+            .. deprecated:: 1.0.0
+
+        inplace : boolean, default False
+            Modify the GeoDataFrame in place (do not create a new object)
+        crs : pyproj.CRS, optional
+            Coordinate system to use. The value can be anything accepted
+            by :meth:`pyproj.CRS.from_user_input() 
<pyproj.crs.CRS.from_user_input>`,
+            such as an authority string (eg "EPSG:4326") or a WKT string.
+            If passed, overrides both DataFrame and col's crs.
+            Otherwise, tries to get crs from passed col values or DataFrame.
+
+        Examples
+        --------
+        >>> from sedona.geopandas import GeoDataFrame
+        >>> from shapely.geometry import Point
+        >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), 
Point(2, 1)]}
+        >>> gdf = GeoDataFrame(d, crs="EPSG:4326")
+        >>> gdf
+            col1     geometry
+        0  name1  POINT (1 2)
+        1  name2  POINT (2 1)
+
+        Passing an array:
+
+        >>> df1 = gdf.set_geometry([Point(0,0), Point(1,1)])
+        >>> df1
+            col1     geometry
+        0  name1  POINT (0 0)
+        1  name2  POINT (1 1)
+
+        Using existing column:
+
+        >>> gdf["buffered"] = gdf.buffer(2)
+        >>> df2 = gdf.set_geometry("buffered")
+        >>> df2.geometry
+        0    POLYGON ((3 2, 2.99037 1.80397, 2.96157 1.6098...
+        1    POLYGON ((4 1, 3.99037 0.80397, 3.96157 0.6098...
+        Name: buffered, dtype: geometry
+
+        Returns
+        -------
+        GeoDataFrame
+
+        See also
+        --------
+        GeoDataFrame.rename_geometry : rename an active geometry column
+        """
+        # Most of the code here is taken from DataFrame.set_index()
+        if inplace:
+            frame = self
+        else:
+            frame = self.copy(deep=False)
+
+        geo_column_name = self._geometry_column_name
+
+        if geo_column_name is None:
+            geo_column_name = "geometry"
+
+        if isinstance(
+            col, (pspd.Series, pd.Series, list, np.ndarray, 
gpd.array.GeometryArray)
+        ):
+            if drop:
+                msg = (
+                    "The `drop` keyword argument is deprecated and has no 
effect when "
+                    "`col` is an array-like value. You should stop passing 
`drop` to "
+                    "`set_geometry` when this is the case."
+                )
+                warnings.warn(msg, category=FutureWarning, stacklevel=2)
+            if isinstance(col, (pspd.Series, pd.Series)) and col.name is not 
None:
+                geo_column_name = col.name
+
+            level = col
+
+            if not isinstance(level, pspd.Series):
+                level = pspd.Series(level)
+        elif hasattr(col, "ndim") and col.ndim > 1:
+            raise ValueError("Must pass array with one dimension only.")
+        else:  # should be a colname
+            try:
+                level = frame[col]
+            except KeyError:
+                raise ValueError(f"Unknown column {col}")
+            if isinstance(level, (sgpd.GeoDataFrame, gpd.GeoDataFrame)):
+                raise ValueError(
+                    "GeoDataFrame does not support setting the geometry column 
where "
+                    "the column name is shared by multiple columns."
+                )
+
+            given_colname_drop_msg = (
+                "The `drop` keyword argument is deprecated and in future the 
only "
+                "supported behaviour will match drop=False. To silence this "
+                "warning and adopt the future behaviour, stop providing "
+                "`drop` as a keyword to `set_geometry`. To replicate the "
+                "`drop=True` behaviour you should update "
+                "your code to\n`geo_col_name = gdf.active_geometry_name;"
+                " gdf.set_geometry(new_geo_col).drop("
+                "columns=geo_col_name).rename_geometry(geo_col_name)`."
+            )
+
+            if drop is False:  # specifically False, not falsy i.e. None
+                # User supplied False explicitly, but arg is deprecated
+                warnings.warn(
+                    given_colname_drop_msg,
+                    category=FutureWarning,
+                    stacklevel=2,
+                )
+            if drop:
+                raise NotImplementedError("Not implemented.")
+            else:
+                # if not dropping, set the active geometry name to the given 
col name
+                geo_column_name = col
+
+        if not crs:
+            crs = getattr(level, "crs", None)
+
+        # Check that we are using a listlike of geometries
+        level = _ensure_geometry(level, crs=crs)
+        # ensure_geometry only sets crs on level if it has crs==None
+
+        # This operation throws a warning to the user asking them to set 
pspd.set_option('compute.ops_on_diff_frames', True)
+        # to allow operations on different frames. We pass these warnings on 
to the user so they must manually set it themselves.
+        if level.crs != crs:
+            level.set_crs(crs, inplace=True, allow_override=True)
+
+        frame._geometry_column_name = geo_column_name
+        frame[geo_column_name] = level
+
+        if not inplace:
+            return frame
+
+    @typing.overload
+    def rename_geometry(
+        self,
+        col: str,
+        inplace: Literal[True] = ...,
+    ) -> None: ...
+
+    @typing.overload
+    def rename_geometry(
+        self,
+        col: str,
+        inplace: Literal[False] = ...,
+    ) -> GeoDataFrame: ...
+
+    def rename_geometry(self, col: str, inplace: bool = False) -> GeoDataFrame 
| None:
+        """
+        Renames the GeoDataFrame geometry column to
+        the specified name. By default yields a new object.
+
+        The original geometry column is replaced with the input.
+
+        Parameters
+        ----------
+        col : new geometry column label
+        inplace : boolean, default False
+            Modify the GeoDataFrame in place (without creating a new object)
+
+        Examples
+        --------
+        >>> from sedona.geopandas import GeoDataFrame
+        >>> from shapely.geometry import Point
+        >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), 
Point(2, 1)]}
+        >>> df = GeoDataFrame(d, crs="EPSG:4326")
+        >>> df1 = df.rename_geometry('geom1')
+        >>> df1.geometry.name
+        'geom1'
+        >>> df.rename_geometry('geom1', inplace=True)
+        >>> df.geometry.name
+        'geom1'
+
+
+        See also
+        --------
+        GeoDataFrame.set_geometry : set the active geometry
+        """
+        geometry_col = self.geometry.name
+        if col in self.columns:
+            raise ValueError(f"Column named {col} already exists")
+        else:
+            if inplace:
+                self.rename(columns={geometry_col: col}, inplace=inplace)
+                self.set_geometry(col, inplace=inplace)
+                return None
+
+            # The same .rename().set_geometry() logic errors for this case, so 
we do it manually instead
+            ps_series = self._psser_for((geometry_col,)).rename(col)
+            sdf = self.copy()
+            sdf[col] = ps_series
+            sdf = sdf.set_geometry(col)
+            return sdf
+
+    @property
+    def active_geometry_name(self) -> Any:
+        """Return the name of the active geometry column
+
+        Returns a name if a GeoDataFrame has an active geometry column set,
+        otherwise returns None. The return type is usually a string, but may be
+        an integer, tuple or other hashable, depending on the contents of the
+        dataframe columns.
+
+        You can also access the active geometry column using the
+        ``.geometry`` property. You can set a GeoSeries to be an active 
geometry
+        using the :meth:`~GeoDataFrame.set_geometry` method.
+
+        Returns
+        -------
+        str or other index label supported by pandas
+            name of an active geometry column or None
+
+        See also
+        --------
+        GeoDataFrame.set_geometry : set the active geometry
+        """
+        return self._geometry_column_name
+
     def _process_geometry_columns(
         self, operation: str, rename_suffix: str = "", *args, **kwargs
     ) -> GeoDataFrame:
@@ -236,11 +589,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         sdf = self._internal.spark_frame.selectExpr(*select_expressions)
         return GeoDataFrame(sdf)
 
-    @property
-    def dtypes(self) -> gpd.GeoSeries | pd.Series | Dtype:
-        # Implementation of the abstract method
-        raise NotImplementedError("This method is not implemented yet.")
-
     def to_geopandas(self) -> gpd.GeoDataFrame | pd.Series:
         # Implementation of the abstract method
         raise NotImplementedError("This method is not implemented yet.")
@@ -291,7 +639,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 self._anchor.copy(), dtype=self.dtypes, index=self._col_label
             )
         else:
-            return self
+            return self  # GeoDataFrame(self._internal.spark_frame.copy())  
"this parameter is not supported but just dummy parameter to match pandas."
 
     @property
     def area(self) -> GeoDataFrame:
@@ -320,15 +668,26 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         """
         return self._process_geometry_columns("ST_Area", rename_suffix="_area")
 
+    def _safe_get_crs(self):
+        """
+        Helper method for getting the crs of the GeoDataframe safely.
+        Returns None if no geometry column is set instead of raising an error.
+        """
+        try:
+            return self.geometry.crs
+        except AttributeError:
+            return None
+
     @property
     def crs(self):
-        # Implementation of the abstract method
-        raise NotImplementedError("This method is not implemented yet.")
+        return self.geometry.crs
 
     @crs.setter
     def crs(self, value):
-        # Implementation of the abstract method
-        raise NotImplementedError("This method is not implemented yet.")
+        # Avoid trying to access the geometry column (which might be missing) 
if crs is None
+        if value is None:
+            return
+        self.geometry.crs = value
 
     @property
     def geom_type(self):
@@ -666,3 +1025,27 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         """
         # Use the Spark DataFrame's write method to write to GeoParquet format
         self._internal.spark_frame.write.format("geoparquet").save(path, 
**kwargs)
+
+
+# -----------------------------------------------------------------------------
+# # Utils
+# -----------------------------------------------------------------------------
+
+
+def _ensure_geometry(data, crs: Any | None = None) -> sgpd.GeoSeries:
+    """
+    Ensure the data is of geometry dtype or converted to it.
+
+    If input is a GeoSeries it is returned (copied first only when a CRS
+    has to be assigned); any other input is wrapped in a new GeoSeries.
+
+    If the input is a GeometryDtype with a set CRS, `crs` is ignored.
+    """
+    if isinstance(data, sgpd.GeoSeries):
+        if data.crs is None and crs is not None:
+            # Avoids caching issues/crs sharing issues
+            data = data.copy()
+            data.crs = crs
+        return data
+    else:
+        return sgpd.GeoSeries(data, crs=crs)
diff --git a/python/sedona/geopandas/geoseries.py 
b/python/sedona/geopandas/geoseries.py
index a94302d89b..821e1ab324 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -117,7 +117,10 @@ class GeoSeries(GeoFrame, pspd.Series):
         self._anchor: GeoDataFrame
         self._col_label: Label
 
+        use_same_anchor = True
+
         def try_geom_to_ewkb(x) -> bytes:
+            nonlocal use_same_anchor
             if isinstance(x, BaseGeometry):
                 kwargs = {}
                 if crs:
@@ -125,9 +128,11 @@ class GeoSeries(GeoFrame, pspd.Series):
 
                     srid = CRS.from_user_input(crs)
                     kwargs["srid"] = srid.to_epsg()
+                use_same_anchor = False
 
                 return shapely.wkb.dumps(x, **kwargs)
             elif isinstance(x, bytearray):
+                use_same_anchor = False
                 return bytes(x)
             elif x is None or isinstance(x, bytes):
                 return x
@@ -172,6 +177,9 @@ class GeoSeries(GeoFrame, pspd.Series):
                 copy=copy,
                 fastpath=fastpath,
             )
+
+            if use_same_anchor:
+                self._anchor = data
         else:
             if isinstance(data, pd.Series):
                 assert index is None
@@ -472,14 +480,8 @@ class GeoSeries(GeoFrame, pspd.Series):
             if isinstance(data_type, BinaryType):
                 query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
 
-            # Convert back to EWKB format if the return type is a geometry
-            if returns_geom:
-                query = f"ST_AsEWKB({query})"
-
             rename = col if not rename else rename
 
-            query = f"{query} as `{rename}`"
-
         elif isinstance(cols, list):
             for col in cols:
                 data_type = df.schema[col].dataType
@@ -491,7 +493,11 @@ class GeoSeries(GeoFrame, pspd.Series):
             # must have rename for multiple columns since we don't know which 
name to default to
             assert rename
 
-            query = f"{query} as `{rename}`"
+        # Convert back to EWKB format if the return type is a geometry
+        if returns_geom:
+            query = f"ST_AsEWKB({query})"
+
+        query = f"{query} as `{rename}`"
 
         # We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to 
regenerate it in the result
         # We always select SPARK_DEFAULT_INDEX_NAME, to retain series index 
info
@@ -509,11 +515,6 @@ class GeoSeries(GeoFrame, pspd.Series):
 
         return GeoSeries(ps_series) if returns_geom else ps_series
 
-    @property
-    def dtypes(self) -> Union[gpd.GeoSeries, pd.Series, Dtype]:
-        # Implementation of the abstract method
-        raise NotImplementedError("This method is not implemented yet.")
-
     def to_geopandas(self) -> gpd.GeoSeries:
         """
         Convert the GeoSeries to a geopandas GeoSeries.
@@ -2090,7 +2091,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         if isinstance(data, list) and not isinstance(data[0], (tuple, list)):
             data = [(obj,) for obj in data]
 
-        select = f"{select} as geometry"
+        select = f"ST_AsEWKB({select}) as geometry"
 
         spark_df = default_session().createDataFrame(data, schema=schema)
         spark_df = spark_df.selectExpr(select)
diff --git a/python/tests/geopandas/test_geodataframe.py 
b/python/tests/geopandas/test_geodataframe.py
index 33e0041dc2..a6c208f694 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -22,30 +22,17 @@ from shapely.geometry import (
 )
 
 from sedona.geopandas import GeoDataFrame, GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
 import pyspark.pandas as ps
 import pandas as pd
 import geopandas as gpd
 import sedona.geopandas as sgpd
 import pytest
-from pandas.testing import assert_frame_equal
-
-
-class TestDataframe(TestBase):
-    # def setup_method(self):
-    #     N = 10
-    #     self.tempdir = tempfile.mkdtemp()
-    #     self.crs = "epsg:4326"
-    #     self.df = GeoDataFrame(
-    #         [
-    #             {"geometry": Point(x, y), "value1": x + y, "value2": x * y}
-    #             for x, y in zip(range(N), range(N))
-    #         ],
-    #         crs=self.crs,
-    #     )
-    #
-    # def teardown_method(self):
-    #     shutil.rmtree(self.tempdir)
+from pandas.testing import assert_frame_equal, assert_series_equal
+from packaging.version import parse as parse_version
+
+
+class TestDataframe(TestGeopandasBase):
     @pytest.mark.parametrize(
         "obj",
         [
@@ -172,6 +159,104 @@ class TestDataframe(TestBase):
         df_copy = df.copy()
         assert type(df_copy) is GeoDataFrame
 
+    def test_set_geometry(self):
+        points1 = [Point(x, x) for x in range(3)]
+        points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+        data = {"geometry1": points1, "geometry2": points2, "attribute": [1, 2, 3]}
+        sgpd_df = sgpd.GeoDataFrame(data)
+
+        # No geometry column set yet
+        with pytest.raises(AttributeError):
+            _ = sgpd_df.geometry
+
+        # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_geometry("geometry1")
+
+        assert sgpd_df.geometry.name == "geometry1"
+
+        # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+        with self.ps_allow_diff_frames():
+            sgpd_df.set_geometry("geometry2", inplace=True)
+        assert sgpd_df.geometry.name == "geometry2"
+
+        # Test the actual values of the geometry column
+        assert_series_equal(
+            sgpd_df.geometry.area.to_pandas(), sgpd_df["geometry2"].area.to_pandas()
+        )
+
+        # unknown column
+        with pytest.raises(ValueError):
+            sgpd_df.set_geometry("nonexistent-column")
+
+        geom = GeoSeries(
+            [Point(x, y) for x, y in zip(range(5), range(5))], name="geometry2"
+        )
+
+        # new crs - setting should default to GeoSeries' crs
+        gs = GeoSeries(geom, crs="epsg:3857")
+
+        with self.ps_allow_diff_frames():
+            new_df = sgpd_df.set_geometry(gs)
+
+        assert new_df.crs == "epsg:3857"
+
+        # explicit crs overrides self and dataframe
+        with self.ps_allow_diff_frames():
+            new_df = sgpd_df.set_geometry(gs, crs="epsg:26909")
+
+        assert new_df.crs == "epsg:26909"
+        assert new_df.geometry.crs == "epsg:26909"
+
+        # Series should use dataframe's crs
+        with self.ps_allow_diff_frames():
+            new_df = sgpd_df.set_geometry(geom.values)
+
+        assert new_df.crs == sgpd_df.crs
+        assert new_df.geometry.crs == sgpd_df.crs
+
+    def test_active_geometry_name(self):
+        if parse_version(gpd.__version__) < parse_version("1.0.0"):
+            return
+
+        points1 = [Point(x, x) for x in range(3)]
+        points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+        data = {"geometry1": points1, "geometry2": points2, "attribute": [1, 2, 3]}
+        df = GeoDataFrame(data)
+
+        # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+        with self.ps_allow_diff_frames():
+            df = df.set_geometry("geometry1")
+        assert df.geometry.name == df.active_geometry_name == "geometry1"
+
+        # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+        with self.ps_allow_diff_frames():
+            df.set_geometry("geometry2", inplace=True)
+        assert df.geometry.name == df.active_geometry_name == "geometry2"
+
+    def test_rename_geometry(self):
+        points1 = [Point(x, x) for x in range(3)]
+        points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+        data = {"geometry1": points1, "geometry2": points2, "attribute": [1, 2, 3]}
+        df = GeoDataFrame(data)
+
+        # TODO: Try to optimize all of these with self.ps_allow_diff_frames() calls away
+        with self.ps_allow_diff_frames():
+            df = df.set_geometry("geometry1")
+        assert df.geometry.name == "geometry1"
+
+        with self.ps_allow_diff_frames():
+            df = df.rename_geometry("geometry3")
+        assert df.geometry.name == "geometry3"
+
+        # test inplace rename
+        with self.ps_allow_diff_frames():
+            df.rename_geometry("geometry4", inplace=True)
+        assert df.geometry.name == "geometry4"
+
     def test_area(self):
         # Create a GeoDataFrame with polygons to test area calculation
         from shapely.geometry import Polygon
diff --git a/python/tests/geopandas/test_geopandas_base.py b/python/tests/geopandas/test_geopandas_base.py
new file mode 100644
index 0000000000..824a53d1b4
--- /dev/null
+++ b/python/tests/geopandas/test_geopandas_base.py
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tests.test_base import TestBase
+from sedona.geopandas import GeoDataFrame, GeoSeries
+import pyspark.sql
+import geopandas as gpd
+import pandas as pd
+import pyspark.pandas as ps
+from pandas.testing import assert_series_equal
+from contextlib import contextmanager
+
+
+class TestGeopandasBase(TestBase):
+    # -----------------------------------------------------------------------------
+    # # Utils
+    # -----------------------------------------------------------------------------
+
+    @classmethod
+    def check_sgpd_equals_spark_df(
+        cls, actual: GeoSeries, expected: pyspark.sql.DataFrame
+    ):
+        assert isinstance(actual, GeoSeries)
+        assert isinstance(expected, pyspark.sql.DataFrame)
+        expected = expected.selectExpr("ST_AsText(expected) as expected")
+        sgpd_result = actual.to_geopandas()
+        expected = expected.toPandas()["expected"]
+        for a, e in zip(sgpd_result, expected):
+            cls.assert_geometry_almost_equal(a, e)
+
+    # TODO chore: rename to check_sgpd_series_equals_gpd_series and change the names in the geoseries tests
+    @classmethod
+    def check_sgpd_equals_gpd(cls, actual: GeoSeries, expected: gpd.GeoSeries):
+        assert isinstance(actual, GeoSeries)
+        assert isinstance(expected, gpd.GeoSeries)
+        sgpd_result = actual.to_geopandas()
+        for a, e in zip(sgpd_result, expected):
+            if a is None or e is None:
+                assert a is None and e is None
+                continue
+            # Sometimes sedona and geopandas both return empty geometries but of different types (e.g Point and Polygon)
+            elif a.is_empty and e.is_empty:
+                continue
+            cls.assert_geometry_almost_equal(
+                a, e, tolerance=1e-2
+            )  # increased tolerance from 1e-6
+
+    @classmethod
+    def check_sgpd_df_equals_gpd_df(
+        cls, actual: GeoDataFrame, expected: gpd.GeoDataFrame
+    ):
+        assert isinstance(actual, GeoDataFrame)
+        assert isinstance(expected, gpd.GeoDataFrame)
+        assert len(actual.columns) == len(expected.columns)
+        for col_name in actual.keys():
+            actual_series, expected_series = actual[col_name], expected[col_name]
+            if isinstance(actual_series, GeoSeries):
+                assert isinstance(actual_series, GeoSeries)
+                # original geopandas does not guarantee a GeoSeries will be returned, so convert it here
+                expected_series = gpd.GeoSeries(expected_series)
+                cls.check_sgpd_equals_gpd(actual_series, expected_series)
+            else:
+                assert isinstance(actual_series, ps.Series)
+                assert isinstance(expected_series, pd.Series)
+                cls.check_pd_series_equal(actual_series, expected_series)
+
+    @classmethod
+    def check_pd_series_equal(cls, actual: ps.Series, expected: pd.Series):
+        assert isinstance(actual, ps.Series)
+        assert isinstance(expected, pd.Series)
+        assert_series_equal(actual.to_pandas(), expected)
+
+    @contextmanager
+    def ps_allow_diff_frames(self):
+        """
+        A context manager to temporarily set a compute.ops_on_diff_frames option.
+        """
+        try:
+            ps.set_option("compute.ops_on_diff_frames", True)
+
+            # Yield control to the code inside the 'with' block
+            yield
+        finally:
+            ps.reset_option("compute.ops_on_diff_frames")
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 352f73e766..dbbe53fdb4 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -22,7 +22,7 @@ import pandas as pd
 import geopandas as gpd
 import sedona.geopandas as sgpd
 from sedona.geopandas import GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
 from shapely import wkt
 from shapely.geometry import (
     Point,
@@ -38,7 +38,7 @@ from pandas.testing import assert_series_equal
 import pytest
 
 
-class TestGeoSeries(TestBase):
+class TestGeoSeries(TestGeopandasBase):
     def setup_method(self):
         self.geoseries = sgpd.GeoSeries(
             [
@@ -55,19 +55,6 @@ class TestGeoSeries(TestBase):
             ]
         )
 
-    def check_sgpd_equals_gpd(self, actual: sgpd.GeoSeries, expected: gpd.GeoSeries):
-        assert isinstance(actual, sgpd.GeoSeries)
-        assert isinstance(expected, gpd.GeoSeries)
-        assert len(actual) == len(expected)
-        sgpd_result = actual.to_geopandas()
-        for a, e in zip(sgpd_result, expected):
-            if a is None or e is None:
-                assert a is None and e is None
-                continue
-            elif a.is_empty and e.is_empty:
-                continue
-            self.assert_geometry_almost_equal(a, e)
-
     def test_area(self):
         result = self.geoseries.area.to_pandas()
         expected = pd.Series([0.0, 0.0, 5.23, 5.23])
diff --git a/python/tests/geopandas/test_match_geopandas_dataframe.py b/python/tests/geopandas/test_match_geopandas_dataframe.py
new file mode 100644
index 0000000000..acc7487431
--- /dev/null
+++ b/python/tests/geopandas/test_match_geopandas_dataframe.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import shutil
+import tempfile
+from shapely.geometry import (
+    Point,
+    Polygon,
+    MultiPoint,
+    MultiLineString,
+    LineString,
+    MultiPolygon,
+    GeometryCollection,
+    LinearRing,
+)
+
+from packaging.version import parse as parse_version
+from sedona.geopandas import GeoDataFrame, GeoSeries
+import geopandas as gpd
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
+import pyspark.pandas as ps
+
+
+class TestMatchGeopandasDataFrame(TestGeopandasBase):
+    def setup_method(self):
+        self.tempdir = tempfile.mkdtemp()
+
+        rows = 3
+
+        self.points = [Point(x, x + 1, x + 2) for x in range(rows)]
+
+        self.multipoints = [
+            MultiPoint([(x, x + 1), (x + 2, x + 3)]) for x in range(rows)
+        ]
+
+        self.linestrings = [
+            LineString([(x, x + 1), (x + 2, x + 3)]) for x in range(rows)
+        ]
+
+        self.multilinestrings = [
+            MultiLineString(
+                [[[x, x + 1], [x + 2, x + 3]], [[x + 4, x + 5], [x + 6, x + 7]]]
+            )
+            for x in range(rows)
+        ]
+
+        self.polygons = [
+            Polygon(
+                [(x, 0, x + 2), (x + 1, 0, x + 3), (x + 2, 1, x + 4), (x + 3, 1, x + 5)]
+            )
+            for x in range(rows)
+        ]
+
+        self.multipolygons = [
+            MultiPolygon(
+                [
+                    (
+                        [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+                        [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+                    )
+                ]
+            )
+            for x in range(rows)
+        ]
+
+        self.geomcollection = [
+            GeometryCollection(
+                [
+                    MultiPoint([(0, 0), (1, 1)]),
+                    MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
+                    MultiPolygon(
+                        [
+                            (
+                                [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+                                [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+                            )
+                        ]
+                    ),
+                ]
+            )
+            for x in range(rows)
+        ]
+
+        self.geometries = {
+            "points": self.points,
+            "multipoints": self.multipoints,
+            "linestrings": self.linestrings,
+            "multilinestrings": self.multilinestrings,
+            "polygons": self.polygons,
+            "multipolygons": self.multipolygons,
+            "geomcollection": self.geomcollection,
+        }
+
+    def teardown_method(self):
+        shutil.rmtree(self.tempdir)
+
+    def test_getitem(self):
+        sgpd_df = GeoDataFrame(self.geometries)
+        gpd_df = gpd.GeoDataFrame(self.geometries)
+        for key in self.geometries.keys():
+            actual_series, expected_series = sgpd_df[key], gpd_df[key]
+            if isinstance(actual_series, GeoSeries):
+                # original geopandas does not guarantee a GeoSeries will be returned, so convert it here
+                expected_series = gpd.GeoSeries(expected_series)
+                self.check_sgpd_equals_gpd(actual_series, expected_series)
+            else:
+                self.check_pd_series_equal(actual_series, expected_series)  # type: ignore
+
+        self.check_sgpd_df_equals_gpd_df(sgpd_df, gpd_df)
+
+    def test_set_geometry(self):
+        sgpd_df = GeoDataFrame(self.geometries)
+        gpd_df = gpd.GeoDataFrame(self.geometries)
+
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_geometry("points")
+        gpd_df = gpd_df.set_geometry("points")
+        assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+        self.check_sgpd_df_equals_gpd_df(sgpd_df, gpd_df)
+
+    def test_active_geometry_name(self):
+        if parse_version(gpd.__version__) < parse_version("1.0.0"):
+            return
+
+        sgpd_df = GeoDataFrame(self.geometries)
+        gpd_df = gpd.GeoDataFrame(self.geometries)
+
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_geometry("polygons")
+        gpd_df = gpd_df.set_geometry("polygons")
+        assert sgpd_df.geometry.name == gpd_df.geometry.name
+        assert (
+            sgpd_df.active_geometry_name
+            == gpd_df.active_geometry_name
+            == sgpd_df.geometry.name
+        )
+
+    def test_rename_geometry(self):
+        sgpd_df = GeoDataFrame(self.geometries)
+        gpd_df = gpd.GeoDataFrame(self.geometries)
+
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.set_geometry("polygons")
+        gpd_df = gpd_df.set_geometry("polygons")
+        assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+        # test inplace
+        with self.ps_allow_diff_frames():
+            sgpd_df.rename_geometry("random", inplace=True)
+        gpd_df.rename_geometry("random", inplace=True)
+        assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+        # Ensure the names are different when we rename to different names
+        with self.ps_allow_diff_frames():
+            sgpd_df = sgpd_df.rename_geometry("name1")
+        gpd_df = gpd_df.rename_geometry("name2")
+        assert sgpd_df.geometry.name != gpd_df.geometry.name
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index a03e1394d1..6e38992f67 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -36,12 +36,12 @@ from shapely.geometry import (
 )
 
 from sedona.geopandas import GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
 import pyspark.pandas as ps
 from packaging.version import parse as parse_version
 
 
-class TestMatchGeopandasSeries(TestBase):
+class TestMatchGeopandasSeries(TestGeopandasBase):
     def setup_method(self):
         self.tempdir = tempfile.mkdtemp()
         self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
@@ -412,7 +412,7 @@ class TestMatchGeopandasSeries(TestBase):
 
     def test_is_valid_reason(self):
         # is_valid_reason was added in geopandas 1.0.0
-        if gpd.__version__ < "1.0.0":
+        if parse_version(gpd.__version__) < parse_version("1.0.0"):
             return
         data = [
             Polygon([(0, 0), (1, 1), (0, 1)]),
@@ -704,38 +704,3 @@ class TestMatchGeopandasSeries(TestBase):
             sgpd_series = sgpd_series.set_crs(epsg=3857, allow_override=True)
             gpd_series = gpd_series.set_crs(epsg=3857, allow_override=True)
             assert sgpd_series.crs == gpd_series.crs
-
-    # -----------------------------------------------------------------------------
-    # # Utils
-    # -----------------------------------------------------------------------------
-
-    def check_sgpd_equals_spark_df(
-        self, actual: GeoSeries, expected: pyspark.sql.DataFrame
-    ):
-        assert isinstance(actual, GeoSeries)
-        assert isinstance(expected, pyspark.sql.DataFrame)
-        expected = expected.selectExpr("ST_AsText(expected) as expected")
-        sgpd_result = actual.to_geopandas()
-        expected = expected.toPandas()["expected"]
-        for a, e in zip(sgpd_result, expected):
-            self.assert_geometry_almost_equal(a, e)
-
-    def check_sgpd_equals_gpd(self, actual: GeoSeries, expected: gpd.GeoSeries):
-        assert isinstance(actual, GeoSeries)
-        assert isinstance(expected, gpd.GeoSeries)
-        sgpd_result = actual.to_geopandas()
-        for a, e in zip(sgpd_result, expected):
-            if a is None or e is None:
-                assert a is None and e is None
-                continue
-            # Sometimes sedona and geopandas both return empty geometries but of different types (e.g Point and Polygon)
-            elif a.is_empty and e.is_empty:
-                continue
-            self.assert_geometry_almost_equal(
-                a, e, tolerance=1e-2
-            )  # increased tolerance from 1e-6
-
-    def check_pd_series_equal(self, actual: ps.Series, expected: pd.Series):
-        assert isinstance(actual, ps.Series)
-        assert isinstance(expected, pd.Series)
-        assert_series_equal(actual.to_pandas(), expected)

(sedona) branch master updated: [GH-2006] Geopandas: set_geometry, rename_geometry, active_geometry_name + refactor tests (#2058)

Reply via email to