This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new da82434b93 [GH-2006] Geopandas: set_geometry, rename_geometry,
active_geometry_name + refactor tests (#2058)
da82434b93 is described below
commit da82434b9353fd1e79a32dae9e9832b92140f01a
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 15 11:52:55 2025 -0700
[GH-2006] Geopandas: set_geometry, rename_geometry, active_geometry_name +
refactor tests (#2058)
* Implement basic set_geometry and refactor tests
* Skip active_geometry_name for old versions
* Implement rename_geometry and update correct version for
active_geometry_name
* Fix version skipping logic in tests
* Clean up
* Fix tests intersection, from_*, and set_geometry after the merge
* Implement rest of logic to pass the crs tests in set_geometry
* clean up
* Fix some tests (all except rename_geometry)
* Fixes to pass tests
* Remove check for option already set
* Clean up unused (commented) setup method
---
python/sedona/geopandas/base.py | 5 -
python/sedona/geopandas/geodataframe.py | 479 ++++++++++++++++++---
python/sedona/geopandas/geoseries.py | 27 +-
python/tests/geopandas/test_geodataframe.py | 123 +++++-
python/tests/geopandas/test_geopandas_base.py | 98 +++++
python/tests/geopandas/test_geoseries.py | 17 +-
.../geopandas/test_match_geopandas_dataframe.py | 173 ++++++++
.../tests/geopandas/test_match_geopandas_series.py | 41 +-
8 files changed, 825 insertions(+), 138 deletions(-)
diff --git a/python/sedona/geopandas/base.py b/python/sedona/geopandas/base.py
index 8bc1ad4dd3..65974efbff 100644
--- a/python/sedona/geopandas/base.py
+++ b/python/sedona/geopandas/base.py
@@ -60,11 +60,6 @@ class GeoFrame(metaclass=ABCMeta):
) -> Union["GeoSeries", Scalar]:
raise NotImplementedError("This method is not implemented yet.")
- @property
- @abstractmethod
- def dtypes(self) -> Union[gpd.GeoSeries, pd.Series, Dtype]:
- raise NotImplementedError("This method is not implemented yet.")
-
@abstractmethod
def to_geopandas(self) -> Union[gpd.GeoDataFrame, pd.Series]:
raise NotImplementedError("This method is not implemented yet.")
diff --git a/python/sedona/geopandas/geodataframe.py
b/python/sedona/geopandas/geodataframe.py
index b627d06a7d..0305153ec0 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -16,11 +16,16 @@
# under the License.
from __future__ import annotations
-from typing import Any
+from typing import Any, Literal, Callable, Union
+import typing
+import warnings
+import numpy as np
+import shapely
import geopandas as gpd
import pandas as pd
import pyspark.pandas as pspd
+import sedona.geopandas as sgpd
from pyspark.pandas import Series as PandasOnSparkSeries
from pyspark.pandas._typing import Dtype
from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
@@ -30,6 +35,13 @@ from sedona.geopandas._typing import Label
from sedona.geopandas.base import GeoFrame
from sedona.geopandas.sindex import SpatialIndex
+from pandas.api.extensions import register_extension_dtype
+from geopandas.geodataframe import crs_mismatch_error
+from geopandas.array import GeometryDtype
+from shapely.geometry.base import BaseGeometry
+
+register_extension_dtype(GeometryDtype)
+
class GeoDataFrame(GeoFrame, pspd.DataFrame):
"""
@@ -64,7 +76,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
1 2
Name: value, dtype: int64
"""
- from sedona.geopandas import GeoSeries
# Handle column access by name
if isinstance(key, str):
@@ -75,27 +86,15 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
if column_name not in self.columns:
raise KeyError(f"Column '{column_name}' does not exist")
- # Get column data from spark_frame
- spark_df = self._internal.spark_frame.select(column_name)
- pandas_df = spark_df.toPandas()
-
- # Check if this is a geometry column
- field = next(
- (f for f in self._internal.spark_frame.schema.fields if f.name
== key),
- None,
- )
-
- if field and (
- field.dataType.typeName() == "geometrytype"
- or field.dataType.typeName() == "binary"
- ):
- # Return as GeoSeries for geometry columns
- return GeoSeries(pandas_df[column_name])
- else:
- # Return as regular pandas Series for non-geometry columns
- from pyspark.pandas import Series
+ # Here we are getting a ps.Series with the same underlying anchor
(ps.Dataframe).
+ # This is important so we don't unnecessarily try to perform
operations on different dataframes
+ ps_series = pspd.DataFrame.__getitem__(self, column_name)
- return Series(pandas_df[column_name])
+ try:
+ result = sgpd.GeoSeries(ps_series)
+ return result
+ except:
+ return ps_series
# Handle list of column names
elif isinstance(key, list) and all(isinstance(k, str) for k in key):
@@ -119,6 +118,8 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
selected_rows = pandas_df[key]
return GeoDataFrame(selected_rows)
+ _geometry_column_name = None
+
def __init__(
self,
data=None,
@@ -138,19 +139,65 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
from sedona.geopandas import GeoSeries
from pyspark.sql import DataFrame as SparkDataFrame
- if isinstance(data, (GeoDataFrame, GeoSeries)):
- assert dtype is None
- assert not copy
- super().__init__(data, index=index, dtype=dtype, copy=copy)
- elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)):
- assert columns is None
- assert dtype is None
- assert not copy
- super().__init__(data, index=index, dtype=dtype)
+ # Simplified version of the function from GeoSeries.__init__()
+ def try_geom_to_ewkb(x) -> bytes:
+ if isinstance(x, BaseGeometry):
+ kwargs = {}
+ if crs:
+ from pyproj import CRS
+
+ srid = CRS.from_user_input(crs)
+ kwargs["srid"] = srid.to_epsg()
+
+ return shapely.wkb.dumps(x, **kwargs)
+ elif isinstance(x, bytearray):
+ return bytes(x)
+ elif x is None or isinstance(x, bytes):
+ return x
+ else:
+ raise TypeError(f"expected geometry or bytes, got {type(x)}:
{x}")
+
+ def safe_apply_to_each_column(
+ df: pd.DataFrame | pspd.DataFrame, func: Callable
+ ):
+ # try except won't work here for ps.DataFrame case, so we use
first_valid_index() instead
+ for col in df.columns:
+ first_idx = df[col].first_valid_index()
+ if first_idx is not None and isinstance(
+ df[col][first_idx], BaseGeometry
+ ):
+ df[col] = df[col].apply(try_geom_to_ewkb)
+ return df
+
+ if isinstance(data, GeoDataFrame):
+ if data._safe_get_crs() is None:
+ data.crs = crs
+ elif isinstance(data, GeoSeries):
+ if data.crs is None:
+ data.crs = crs
+
+ # For each of these super().__init__() calls, we let pyspark
decide which inputs are valid or not
+ # instead of calling e.g assert not dtype ourselves.
+ # This way, if Spark adds support later, then we inherit those
changes naturally
+ super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
+ elif isinstance(data, PandasOnSparkDataFrame):
+
+ data = safe_apply_to_each_column(data, try_geom_to_ewkb)
+
+ super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
+ elif isinstance(data, PandasOnSparkSeries):
+
+ try:
+ data = GeoSeries(data)
+ except TypeError:
+ pass
+
+ super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
elif isinstance(data, SparkDataFrame):
assert columns is None
assert dtype is None
assert not copy
+
if index is None:
internal = InternalFrame(spark_frame=data,
index_spark_columns=None)
object.__setattr__(self, "_internal_frame", internal)
@@ -170,15 +217,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
)
gdf = gpd.GeoDataFrame(df)
# convert each geometry column to wkb type
- import shapely
- for col in gdf.columns:
- # It's possible we get a list, dict, pd.Series, gpd.GeoSeries,
etc of shapely.Geometry objects.
- if len(gdf[col]) > 0 and isinstance(
- gdf[col].iloc[0], shapely.geometry.base.BaseGeometry
- ):
- gdf[col] = gdf[col].apply(lambda geom: geom.wkb)
+ # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc
of shapely.Geometry objects.
+ gdf = safe_apply_to_each_column(gdf, try_geom_to_ewkb)
+
pdf = pd.DataFrame(gdf)
+
# initialize the parent class pyspark Dataframe with the pandas
Series
super().__init__(
data=pdf,
@@ -187,6 +231,315 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
copy=copy,
)
+ if isinstance(data, (GeoDataFrame, gpd.GeoDataFrame)):
+ self._geometry_column_name = data._geometry_column_name
+ if crs is not None and data.crs != crs:
+ raise ValueError(crs_mismatch_error)
+
+ def _get_geometry(self) -> sgpd.GeoSeries:
+ if self._geometry_column_name not in self:
+ if self._geometry_column_name is None:
+ msg = (
+ "You are calling a geospatial method on the GeoDataFrame, "
+ "but the active geometry column to use has not been set. "
+ )
+ else:
+ msg = (
+ "You are calling a geospatial method on the GeoDataFrame, "
+ f"but the active geometry column
('{self._geometry_column_name}') "
+ "is not present. "
+ )
+ geo_cols = list(self.columns[self.dtypes == "geometry"])
+ if len(geo_cols) > 0:
+ msg += (
+ f"\nThere are columns with geometry data type
({geo_cols}), and "
+ "you can either set one as the active geometry with "
+ 'df.set_geometry("name") or access the column as a '
+ 'GeoSeries (df["name"]) and call the method directly on
it.'
+ )
+ else:
+ msg += (
+ "\nThere are no existing columns with geometry data type.
You can "
+ "add a geometry column as the active geometry column with "
+ "df.set_geometry. "
+ )
+
+ raise AttributeError(msg)
+ return self[self._geometry_column_name]
+
+ def _set_geometry(self, col):
+ # This check is included in the original geopandas. Note that this
prevents assigning a str to the property
+ # e.g. df.geometry = "geometry"
+ # However, the user can still specify a str in the public
.set_geometry() method
+ # ie. df.geometry = "geometry1" errors, but
df.set_geometry("geometry1") works
+ if not pd.api.types.is_list_like(col):
+ raise ValueError("Must use a list-like to set the geometry
property")
+ self.set_geometry(col, inplace=True)
+
+ geometry = property(
+ fget=_get_geometry, fset=_set_geometry, doc="Geometry data for
GeoDataFrame"
+ )
+
+ @typing.overload
+ def set_geometry(
+ self,
+ col,
+ drop: bool | None = ...,
+ inplace: Literal[True] = ...,
+ crs: Any | None = ...,
+ ) -> None: ...
+
+ @typing.overload
+ def set_geometry(
+ self,
+ col,
+ drop: bool | None = ...,
+ inplace: Literal[False] = ...,
+ crs: Any | None = ...,
+ ) -> GeoDataFrame: ...
+
+ def set_geometry(
+ self,
+ col,
+ drop: bool | None = None,
+ inplace: bool = False,
+ crs: Any | None = None,
+ ) -> GeoDataFrame | None:
+ """
+ Set the GeoDataFrame geometry using either an existing column or
+ the specified input. By default yields a new object.
+
+ The original geometry column is replaced with the input.
+
+ Parameters
+ ----------
+ col : column label or array-like
+ An existing column name or values to set as the new geometry
column.
+ If values (array-like, (Geo)Series) are passed, then if they are
named
+ (Series) the new geometry column will have the corresponding name,
+ otherwise the existing geometry column will be replaced. If there
is
+ no existing geometry column, the new geometry column will use the
+ default name "geometry".
+ drop : boolean, default False
+ When specifying a named Series or an existing column name for
`col`,
+ controls if the previous geometry column should be dropped from the
+ result. The default of False keeps both the old and new geometry
column.
+
+ .. deprecated:: 1.0.0
+
+ inplace : boolean, default False
+ Modify the GeoDataFrame in place (do not create a new object)
+ crs : pyproj.CRS, optional
+ Coordinate system to use. The value can be anything accepted
+ by :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ If passed, overrides both DataFrame and col's crs.
+ Otherwise, tries to get crs from passed col values or DataFrame.
+
+ Examples
+ --------
+ >>> from sedona.geopandas import GeoDataFrame
+ >>> from shapely.geometry import Point
+ >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2),
Point(2, 1)]}
+ >>> gdf = GeoDataFrame(d, crs="EPSG:4326")
+ >>> gdf
+ col1 geometry
+ 0 name1 POINT (1 2)
+ 1 name2 POINT (2 1)
+
+ Passing an array:
+
+ >>> df1 = gdf.set_geometry([Point(0,0), Point(1,1)])
+ >>> df1
+ col1 geometry
+ 0 name1 POINT (0 0)
+ 1 name2 POINT (1 1)
+
+ Using existing column:
+
+ >>> gdf["buffered"] = gdf.buffer(2)
+ >>> df2 = gdf.set_geometry("buffered")
+ >>> df2.geometry
+ 0 POLYGON ((3 2, 2.99037 1.80397, 2.96157 1.6098...
+ 1 POLYGON ((4 1, 3.99037 0.80397, 3.96157 0.6098...
+ Name: buffered, dtype: geometry
+
+ Returns
+ -------
+ GeoDataFrame
+
+ See also
+ --------
+ GeoDataFrame.rename_geometry : rename an active geometry column
+ """
+ # Most of the code here is taken from DataFrame.set_index()
+ if inplace:
+ frame = self
+ else:
+ frame = self.copy(deep=False)
+
+ geo_column_name = self._geometry_column_name
+
+ if geo_column_name is None:
+ geo_column_name = "geometry"
+
+ if isinstance(
+ col, (pspd.Series, pd.Series, list, np.ndarray,
gpd.array.GeometryArray)
+ ):
+ if drop:
+ msg = (
+ "The `drop` keyword argument is deprecated and has no
effect when "
+ "`col` is an array-like value. You should stop passing
`drop` to "
+ "`set_geometry` when this is the case."
+ )
+ warnings.warn(msg, category=FutureWarning, stacklevel=2)
+ if isinstance(col, (pspd.Series, pd.Series)) and col.name is not
None:
+ geo_column_name = col.name
+
+ level = col
+
+ if not isinstance(level, pspd.Series):
+ level = pspd.Series(level)
+ elif hasattr(col, "ndim") and col.ndim > 1:
+ raise ValueError("Must pass array with one dimension only.")
+ else: # should be a colname
+ try:
+ level = frame[col]
+ except KeyError:
+ raise ValueError(f"Unknown column {col}")
+ if isinstance(level, (sgpd.GeoDataFrame, gpd.GeoDataFrame)):
+ raise ValueError(
+ "GeoDataFrame does not support setting the geometry column
where "
+ "the column name is shared by multiple columns."
+ )
+
+ given_colname_drop_msg = (
+ "The `drop` keyword argument is deprecated and in future the
only "
+ "supported behaviour will match drop=False. To silence this "
+ "warning and adopt the future behaviour, stop providing "
+ "`drop` as a keyword to `set_geometry`. To replicate the "
+ "`drop=True` behaviour you should update "
+ "your code to\n`geo_col_name = gdf.active_geometry_name;"
+ " gdf.set_geometry(new_geo_col).drop("
+ "columns=geo_col_name).rename_geometry(geo_col_name)`."
+ )
+
+ if drop is False: # specifically False, not falsy i.e. None
+ # User supplied False explicitly, but arg is deprecated
+ warnings.warn(
+ given_colname_drop_msg,
+ category=FutureWarning,
+ stacklevel=2,
+ )
+ if drop:
+ raise NotImplementedError("Not implemented.")
+ else:
+ # if not dropping, set the active geometry name to the given
col name
+ geo_column_name = col
+
+ if not crs:
+ crs = getattr(level, "crs", None)
+
+ # Check that we are using a listlike of geometries
+ level = _ensure_geometry(level, crs=crs)
+ # ensure_geometry only sets crs on level if it has crs==None
+
+ # This operation throws a warning to the user asking them to set
pspd.set_option('compute.ops_on_diff_frames', True)
+ # to allow operations on different frames. We pass these warnings on
to the user so they must manually set it themselves.
+ if level.crs != crs:
+ level.set_crs(crs, inplace=True, allow_override=True)
+
+ frame._geometry_column_name = geo_column_name
+ frame[geo_column_name] = level
+
+ if not inplace:
+ return frame
+
+ @typing.overload
+ def rename_geometry(
+ self,
+ col: str,
+ inplace: Literal[True] = ...,
+ ) -> None: ...
+
+ @typing.overload
+ def rename_geometry(
+ self,
+ col: str,
+ inplace: Literal[False] = ...,
+ ) -> GeoDataFrame: ...
+
+ def rename_geometry(self, col: str, inplace: bool = False) -> GeoDataFrame
| None:
+ """
+ Renames the GeoDataFrame geometry column to
+ the specified name. By default yields a new object.
+
+ The original geometry column is replaced with the input.
+
+ Parameters
+ ----------
+ col : new geometry column label
+ inplace : boolean, default False
+ Modify the GeoDataFrame in place (without creating a new object)
+
+ Examples
+ --------
+ >>> from sedona.geopandas import GeoDataFrame
+ >>> from shapely.geometry import Point
+ >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2),
Point(2, 1)]}
+ >>> df = GeoDataFrame(d, crs="EPSG:4326")
+ >>> df1 = df.rename_geometry('geom1')
+ >>> df1.geometry.name
+ 'geom1'
+ >>> df.rename_geometry('geom1', inplace=True)
+ >>> df.geometry.name
+ 'geom1'
+
+
+ See also
+ --------
+ GeoDataFrame.set_geometry : set the active geometry
+ """
+ geometry_col = self.geometry.name
+ if col in self.columns:
+ raise ValueError(f"Column named {col} already exists")
+ else:
+ if inplace:
+ self.rename(columns={geometry_col: col}, inplace=inplace)
+ self.set_geometry(col, inplace=inplace)
+ return None
+
+ # The same .rename().set_geometry() logic errors for this case, so
we do it manually instead
+ ps_series = self._psser_for((geometry_col,)).rename(col)
+ sdf = self.copy()
+ sdf[col] = ps_series
+ sdf = sdf.set_geometry(col)
+ return sdf
+
+ @property
+ def active_geometry_name(self) -> Any:
+ """Return the name of the active geometry column
+
+ Returns a name if a GeoDataFrame has an active geometry column set,
+ otherwise returns None. The return type is usually a string, but may be
+ an integer, tuple or other hashable, depending on the contents of the
+ dataframe columns.
+
+ You can also access the active geometry column using the
+ ``.geometry`` property. You can set a GeoSeries to be an active
geometry
+ using the :meth:`~GeoDataFrame.set_geometry` method.
+
+ Returns
+ -------
+ str or other index label supported by pandas
+ name of an active geometry column or None
+
+ See also
+ --------
+ GeoDataFrame.set_geometry : set the active geometry
+ """
+ return self._geometry_column_name
+
def _process_geometry_columns(
self, operation: str, rename_suffix: str = "", *args, **kwargs
) -> GeoDataFrame:
@@ -236,11 +589,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
sdf = self._internal.spark_frame.selectExpr(*select_expressions)
return GeoDataFrame(sdf)
- @property
- def dtypes(self) -> gpd.GeoSeries | pd.Series | Dtype:
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
-
def to_geopandas(self) -> gpd.GeoDataFrame | pd.Series:
# Implementation of the abstract method
raise NotImplementedError("This method is not implemented yet.")
@@ -291,7 +639,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
self._anchor.copy(), dtype=self.dtypes, index=self._col_label
)
else:
- return self
+ return self # GeoDataFrame(self._internal.spark_frame.copy())
"this parameter is not supported but just dummy parameter to match pandas."
@property
def area(self) -> GeoDataFrame:
@@ -320,15 +668,26 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
"""
return self._process_geometry_columns("ST_Area", rename_suffix="_area")
+ def _safe_get_crs(self):
+ """
+ Helper method for getting the crs of the GeoDataframe safely.
+ Returns None if no geometry column is set instead of raising an error.
+ """
+ try:
+ return self.geometry.crs
+ except AttributeError:
+ return None
+
@property
def crs(self):
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
+ return self.geometry.crs
@crs.setter
def crs(self, value):
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
+ # Avoid trying to access the geometry column (which might be missing)
if crs is None
+ if value is None:
+ return
+ self.geometry.crs = value
@property
def geom_type(self):
@@ -666,3 +1025,27 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
"""
# Use the Spark DataFrame's write method to write to GeoParquet format
self._internal.spark_frame.write.format("geoparquet").save(path,
**kwargs)
+
+
+# -----------------------------------------------------------------------------
+# # Utils
+# -----------------------------------------------------------------------------
+
+
+def _ensure_geometry(data, crs: Any | None = None) -> sgpd.GeoSeries:
+ """
+ Ensure the data is of geometry dtype or converted to it.
+
+ The output is always a GeoSeries: a GeoSeries input is returned as-is
+ (or as a copy when `crs` must be set on it); other input is wrapped in one.
+
+ If the input is a GeometryDtype with a set CRS, `crs` is ignored.
+ """
+ if isinstance(data, sgpd.GeoSeries):
+ if data.crs is None and crs is not None:
+ # Avoids caching issues/crs sharing issues
+ data = data.copy()
+ data.crs = crs
+ return data
+ else:
+ return sgpd.GeoSeries(data, crs=crs)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index a94302d89b..821e1ab324 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -117,7 +117,10 @@ class GeoSeries(GeoFrame, pspd.Series):
self._anchor: GeoDataFrame
self._col_label: Label
+ use_same_anchor = True
+
def try_geom_to_ewkb(x) -> bytes:
+ nonlocal use_same_anchor
if isinstance(x, BaseGeometry):
kwargs = {}
if crs:
@@ -125,9 +128,11 @@ class GeoSeries(GeoFrame, pspd.Series):
srid = CRS.from_user_input(crs)
kwargs["srid"] = srid.to_epsg()
+ use_same_anchor = False
return shapely.wkb.dumps(x, **kwargs)
elif isinstance(x, bytearray):
+ use_same_anchor = False
return bytes(x)
elif x is None or isinstance(x, bytes):
return x
@@ -172,6 +177,9 @@ class GeoSeries(GeoFrame, pspd.Series):
copy=copy,
fastpath=fastpath,
)
+
+ if use_same_anchor:
+ self._anchor = data
else:
if isinstance(data, pd.Series):
assert index is None
@@ -472,14 +480,8 @@ class GeoSeries(GeoFrame, pspd.Series):
if isinstance(data_type, BinaryType):
query = query.replace(f"`{cols}`", f"ST_GeomFromWKB(`{cols}`)")
- # Convert back to EWKB format if the return type is a geometry
- if returns_geom:
- query = f"ST_AsEWKB({query})"
-
rename = col if not rename else rename
- query = f"{query} as `{rename}`"
-
elif isinstance(cols, list):
for col in cols:
data_type = df.schema[col].dataType
@@ -491,7 +493,11 @@ class GeoSeries(GeoFrame, pspd.Series):
# must have rename for multiple columns since we don't know which
name to default to
assert rename
- query = f"{query} as `{rename}`"
+ # Convert back to EWKB format if the return type is a geometry
+ if returns_geom:
+ query = f"ST_AsEWKB({query})"
+
+ query = f"{query} as `{rename}`"
# We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to
regenerate it in the result
# We always select SPARK_DEFAULT_INDEX_NAME, to retain series index
info
@@ -509,11 +515,6 @@ class GeoSeries(GeoFrame, pspd.Series):
return GeoSeries(ps_series) if returns_geom else ps_series
- @property
- def dtypes(self) -> Union[gpd.GeoSeries, pd.Series, Dtype]:
- # Implementation of the abstract method
- raise NotImplementedError("This method is not implemented yet.")
-
def to_geopandas(self) -> gpd.GeoSeries:
"""
Convert the GeoSeries to a geopandas GeoSeries.
@@ -2090,7 +2091,7 @@ class GeoSeries(GeoFrame, pspd.Series):
if isinstance(data, list) and not isinstance(data[0], (tuple, list)):
data = [(obj,) for obj in data]
- select = f"{select} as geometry"
+ select = f"ST_AsEWKB({select}) as geometry"
spark_df = default_session().createDataFrame(data, schema=schema)
spark_df = spark_df.selectExpr(select)
diff --git a/python/tests/geopandas/test_geodataframe.py
b/python/tests/geopandas/test_geodataframe.py
index 33e0041dc2..a6c208f694 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -22,30 +22,17 @@ from shapely.geometry import (
)
from sedona.geopandas import GeoDataFrame, GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
import pyspark.pandas as ps
import pandas as pd
import geopandas as gpd
import sedona.geopandas as sgpd
import pytest
-from pandas.testing import assert_frame_equal
-
-
-class TestDataframe(TestBase):
- # def setup_method(self):
- # N = 10
- # self.tempdir = tempfile.mkdtemp()
- # self.crs = "epsg:4326"
- # self.df = GeoDataFrame(
- # [
- # {"geometry": Point(x, y), "value1": x + y, "value2": x * y}
- # for x, y in zip(range(N), range(N))
- # ],
- # crs=self.crs,
- # )
- #
- # def teardown_method(self):
- # shutil.rmtree(self.tempdir)
+from pandas.testing import assert_frame_equal, assert_series_equal
+from packaging.version import parse as parse_version
+
+
+class TestDataframe(TestGeopandasBase):
@pytest.mark.parametrize(
"obj",
[
@@ -172,6 +159,104 @@ class TestDataframe(TestBase):
df_copy = df.copy()
assert type(df_copy) is GeoDataFrame
+ def test_set_geometry(self):
+ points1 = [Point(x, x) for x in range(3)]
+ points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+ data = {"geometry1": points1, "geometry2": points2, "attribute": [1,
2, 3]}
+ sgpd_df = sgpd.GeoDataFrame(data)
+
+ # No geometry column set yet
+ with pytest.raises(AttributeError):
+ _ = sgpd_df.geometry
+
+ # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_geometry("geometry1")
+
+ assert sgpd_df.geometry.name == "geometry1"
+
+ # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+ with self.ps_allow_diff_frames():
+ sgpd_df.set_geometry("geometry2", inplace=True)
+ assert sgpd_df.geometry.name == "geometry2"
+
+ # Test the actual values of the geometry column
+ assert_series_equal(
+ sgpd_df.geometry.area.to_pandas(),
sgpd_df["geometry2"].area.to_pandas()
+ )
+
+ # unknown column
+ with pytest.raises(ValueError):
+ sgpd_df.set_geometry("nonexistent-column")
+
+ geom = GeoSeries(
+ [Point(x, y) for x, y in zip(range(5), range(5))], name="geometry2"
+ )
+
+ # new crs - setting should default to GeoSeries' crs
+ gs = GeoSeries(geom, crs="epsg:3857")
+
+ with self.ps_allow_diff_frames():
+ new_df = sgpd_df.set_geometry(gs)
+
+ assert new_df.crs == "epsg:3857"
+
+ # explicit crs overrides self and dataframe
+ with self.ps_allow_diff_frames():
+ new_df = sgpd_df.set_geometry(gs, crs="epsg:26909")
+
+ assert new_df.crs == "epsg:26909"
+ assert new_df.geometry.crs == "epsg:26909"
+
+ # Series should use dataframe's crs
+ with self.ps_allow_diff_frames():
+ new_df = sgpd_df.set_geometry(geom.values)
+
+ assert new_df.crs == sgpd_df.crs
+ assert new_df.geometry.crs == sgpd_df.crs
+
+ def test_active_geometry_name(self):
+ if parse_version(gpd.__version__) < parse_version("1.0.0"):
+ return
+
+ points1 = [Point(x, x) for x in range(3)]
+ points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+ data = {"geometry1": points1, "geometry2": points2, "attribute": [1,
2, 3]}
+ df = GeoDataFrame(data)
+
+ # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+ with self.ps_allow_diff_frames():
+ df = df.set_geometry("geometry1")
+ assert df.geometry.name == df.active_geometry_name == "geometry1"
+
+ # TODO: Try to optimize this with self.ps_allow_diff_frames() away
+ with self.ps_allow_diff_frames():
+ df.set_geometry("geometry2", inplace=True)
+ assert df.geometry.name == df.active_geometry_name == "geometry2"
+
+ def test_rename_geometry(self):
+ points1 = [Point(x, x) for x in range(3)]
+ points2 = [Point(x + 5, x + 5) for x in range(3)]
+
+ data = {"geometry1": points1, "geometry2": points2, "attribute": [1,
2, 3]}
+ df = GeoDataFrame(data)
+
+ # TODO: Try to optimize all of these with self.ps_allow_diff_frames()
calls away
+ with self.ps_allow_diff_frames():
+ df = df.set_geometry("geometry1")
+ assert df.geometry.name == "geometry1"
+
+ with self.ps_allow_diff_frames():
+ df = df.rename_geometry("geometry3")
+ assert df.geometry.name == "geometry3"
+
+ # test inplace rename
+ with self.ps_allow_diff_frames():
+ df.rename_geometry("geometry4", inplace=True)
+ assert df.geometry.name == "geometry4"
+
def test_area(self):
# Create a GeoDataFrame with polygons to test area calculation
from shapely.geometry import Polygon
diff --git a/python/tests/geopandas/test_geopandas_base.py
b/python/tests/geopandas/test_geopandas_base.py
new file mode 100644
index 0000000000..824a53d1b4
--- /dev/null
+++ b/python/tests/geopandas/test_geopandas_base.py
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tests.test_base import TestBase
+from sedona.geopandas import GeoDataFrame, GeoSeries
+import pyspark.sql
+import geopandas as gpd
+import pandas as pd
+import pyspark.pandas as ps
+from pandas.testing import assert_series_equal
+from contextlib import contextmanager
+
+
+class TestGeopandasBase(TestBase):
+ #
-----------------------------------------------------------------------------
+ # # Utils
+ #
-----------------------------------------------------------------------------
+
+ @classmethod
+ def check_sgpd_equals_spark_df(
+ cls, actual: GeoSeries, expected: pyspark.sql.DataFrame
+ ):
+ assert isinstance(actual, GeoSeries)
+ assert isinstance(expected, pyspark.sql.DataFrame)
+ expected = expected.selectExpr("ST_AsText(expected) as expected")
+ sgpd_result = actual.to_geopandas()
+ expected = expected.toPandas()["expected"]
+ for a, e in zip(sgpd_result, expected):
+ cls.assert_geometry_almost_equal(a, e)
+
+ # TODO chore: rename to check_sgpd_series_equals_gpd_series and change the
names in the geoseries tests
+ @classmethod
+ def check_sgpd_equals_gpd(cls, actual: GeoSeries, expected: gpd.GeoSeries):
+ assert isinstance(actual, GeoSeries)
+ assert isinstance(expected, gpd.GeoSeries)
+ sgpd_result = actual.to_geopandas()
+ for a, e in zip(sgpd_result, expected):
+ if a is None or e is None:
+ assert a is None and e is None
+ continue
+ # Sometimes sedona and geopandas both return empty geometries but
of different types (e.g Point and Polygon)
+ elif a.is_empty and e.is_empty:
+ continue
+ cls.assert_geometry_almost_equal(
+ a, e, tolerance=1e-2
+ ) # increased tolerance from 1e-6
+
+ @classmethod
+ def check_sgpd_df_equals_gpd_df(
+ cls, actual: GeoDataFrame, expected: gpd.GeoDataFrame
+ ):
+ assert isinstance(actual, GeoDataFrame)
+ assert isinstance(expected, gpd.GeoDataFrame)
+ assert len(actual.columns) == len(expected.columns)
+ for col_name in actual.keys():
+ actual_series, expected_series = actual[col_name],
expected[col_name]
+ if isinstance(actual_series, GeoSeries):
+ assert isinstance(actual_series, GeoSeries)
+ # original geopandas does not guarantee a GeoSeries will be
returned, so convert it here
+ expected_series = gpd.GeoSeries(expected_series)
+ cls.check_sgpd_equals_gpd(actual_series, expected_series)
+ else:
+ assert isinstance(actual_series, ps.Series)
+ assert isinstance(expected_series, pd.Series)
+ cls.check_pd_series_equal(actual_series, expected_series)
+
+ @classmethod
+ def check_pd_series_equal(cls, actual: ps.Series, expected: pd.Series):
+ assert isinstance(actual, ps.Series)
+ assert isinstance(expected, pd.Series)
+ assert_series_equal(actual.to_pandas(), expected)
+
+ @contextmanager
+ def ps_allow_diff_frames(self):
+ """
+ A context manager to temporarily set a compute.ops_on_diff_frames
option.
+ """
+ try:
+ ps.set_option("compute.ops_on_diff_frames", True)
+
+ # Yield control to the code inside the 'with' block
+ yield
+ finally:
+ ps.reset_option("compute.ops_on_diff_frames")
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 352f73e766..dbbe53fdb4 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -22,7 +22,7 @@ import pandas as pd
import geopandas as gpd
import sedona.geopandas as sgpd
from sedona.geopandas import GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
from shapely import wkt
from shapely.geometry import (
Point,
@@ -38,7 +38,7 @@ from pandas.testing import assert_series_equal
import pytest
-class TestGeoSeries(TestBase):
+class TestGeoSeries(TestGeopandasBase):
def setup_method(self):
self.geoseries = sgpd.GeoSeries(
[
@@ -55,19 +55,6 @@ class TestGeoSeries(TestBase):
]
)
- def check_sgpd_equals_gpd(self, actual: sgpd.GeoSeries, expected: gpd.GeoSeries):
- assert isinstance(actual, sgpd.GeoSeries)
- assert isinstance(expected, gpd.GeoSeries)
- assert len(actual) == len(expected)
- sgpd_result = actual.to_geopandas()
- for a, e in zip(sgpd_result, expected):
- if a is None or e is None:
- assert a is None and e is None
- continue
- elif a.is_empty and e.is_empty:
- continue
- self.assert_geometry_almost_equal(a, e)
-
def test_area(self):
result = self.geoseries.area.to_pandas()
expected = pd.Series([0.0, 0.0, 5.23, 5.23])
diff --git a/python/tests/geopandas/test_match_geopandas_dataframe.py b/python/tests/geopandas/test_match_geopandas_dataframe.py
new file mode 100644
index 0000000000..acc7487431
--- /dev/null
+++ b/python/tests/geopandas/test_match_geopandas_dataframe.py
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import shutil
+import tempfile
+from shapely.geometry import (
+ Point,
+ Polygon,
+ MultiPoint,
+ MultiLineString,
+ LineString,
+ MultiPolygon,
+ GeometryCollection,
+ LinearRing,
+)
+
+from packaging.version import parse as parse_version
+from sedona.geopandas import GeoDataFrame, GeoSeries
+import geopandas as gpd
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
+import pyspark.pandas as ps
+
+
+class TestMatchGeopandasDataFrame(TestGeopandasBase):
+ def setup_method(self):
+ self.tempdir = tempfile.mkdtemp()
+
+ rows = 3
+
+ self.points = [Point(x, x + 1, x + 2) for x in range(rows)]
+
+ self.multipoints = [
+ MultiPoint([(x, x + 1), (x + 2, x + 3)]) for x in range(rows)
+ ]
+
+ self.linestrings = [
+ LineString([(x, x + 1), (x + 2, x + 3)]) for x in range(rows)
+ ]
+
+ self.multilinestrings = [
+ MultiLineString(
+ [[[x, x + 1], [x + 2, x + 3]], [[x + 4, x + 5], [x + 6, x + 7]]]
+ )
+ for x in range(rows)
+ ]
+
+ self.polygons = [
+ Polygon(
+ [(x, 0, x + 2), (x + 1, 0, x + 3), (x + 2, 1, x + 4), (x + 3, 1, x + 5)]
+ )
+ for x in range(rows)
+ ]
+
+ self.multipolygons = [
+ MultiPolygon(
+ [
+ (
+ [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+ [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+ )
+ ]
+ )
+ for x in range(rows)
+ ]
+
+ self.geomcollection = [
+ GeometryCollection(
+ [
+ MultiPoint([(0, 0), (1, 1)]),
+ MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
+ MultiPolygon(
+ [
+ (
+ [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+ [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+ )
+ ]
+ ),
+ ]
+ )
+ for x in range(rows)
+ ]
+
+ self.geometries = {
+ "points": self.points,
+ "multipoints": self.multipoints,
+ "linestrings": self.linestrings,
+ "multilinestrings": self.multilinestrings,
+ "polygons": self.polygons,
+ "multipolygons": self.multipolygons,
+ "geomcollection": self.geomcollection,
+ }
+
+ def teardown_method(self):
+ shutil.rmtree(self.tempdir)
+
+ def test_getitem(self):
+ sgpd_df = GeoDataFrame(self.geometries)
+ gpd_df = gpd.GeoDataFrame(self.geometries)
+ for key in self.geometries.keys():
+ actual_series, expected_series = sgpd_df[key], gpd_df[key]
+ if isinstance(actual_series, GeoSeries):
+ # original geopandas does not guarantee a GeoSeries will be returned, so convert it here
+ expected_series = gpd.GeoSeries(expected_series)
+ self.check_sgpd_equals_gpd(actual_series, expected_series)
+ else:
+ self.check_pd_series_equal(actual_series, expected_series) # type: ignore
+
+ self.check_sgpd_df_equals_gpd_df(sgpd_df, gpd_df)
+
+ def test_set_geometry(self):
+ sgpd_df = GeoDataFrame(self.geometries)
+ gpd_df = gpd.GeoDataFrame(self.geometries)
+
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_geometry("points")
+ gpd_df = gpd_df.set_geometry("points")
+ assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+ self.check_sgpd_df_equals_gpd_df(sgpd_df, gpd_df)
+
+ def test_active_geometry_name(self):
+ if parse_version(gpd.__version__) < parse_version("1.0.0"):
+ return
+
+ sgpd_df = GeoDataFrame(self.geometries)
+ gpd_df = gpd.GeoDataFrame(self.geometries)
+
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_geometry("polygons")
+ gpd_df = gpd_df.set_geometry("polygons")
+ assert sgpd_df.geometry.name == gpd_df.geometry.name
+ assert (
+ sgpd_df.active_geometry_name
+ == gpd_df.active_geometry_name
+ == sgpd_df.geometry.name
+ )
+
+ def test_rename_geometry(self):
+ sgpd_df = GeoDataFrame(self.geometries)
+ gpd_df = gpd.GeoDataFrame(self.geometries)
+
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_geometry("polygons")
+ gpd_df = gpd_df.set_geometry("polygons")
+ assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+ # test inplace
+ with self.ps_allow_diff_frames():
+ sgpd_df.rename_geometry("random", inplace=True)
+ gpd_df.rename_geometry("random", inplace=True)
+ assert sgpd_df.geometry.name == gpd_df.geometry.name
+
+ # Ensure the names are different when we rename to different names
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.rename_geometry("name1")
+ gpd_df = gpd_df.rename_geometry("name2")
+ assert sgpd_df.geometry.name != gpd_df.geometry.name
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index a03e1394d1..6e38992f67 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -36,12 +36,12 @@ from shapely.geometry import (
)
from sedona.geopandas import GeoSeries
-from tests.test_base import TestBase
+from tests.geopandas.test_geopandas_base import TestGeopandasBase
import pyspark.pandas as ps
from packaging.version import parse as parse_version
-class TestMatchGeopandasSeries(TestBase):
+class TestMatchGeopandasSeries(TestGeopandasBase):
def setup_method(self):
self.tempdir = tempfile.mkdtemp()
self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
@@ -412,7 +412,7 @@ class TestMatchGeopandasSeries(TestBase):
def test_is_valid_reason(self):
# is_valid_reason was added in geopandas 1.0.0
- if gpd.__version__ < "1.0.0":
+ if parse_version(gpd.__version__) < parse_version("1.0.0"):
return
data = [
Polygon([(0, 0), (1, 1), (0, 1)]),
@@ -704,38 +704,3 @@ class TestMatchGeopandasSeries(TestBase):
sgpd_series = sgpd_series.set_crs(epsg=3857, allow_override=True)
gpd_series = gpd_series.set_crs(epsg=3857, allow_override=True)
assert sgpd_series.crs == gpd_series.crs
-
- # -----------------------------------------------------------------------------
- # # Utils
- # -----------------------------------------------------------------------------
-
- def check_sgpd_equals_spark_df(
- self, actual: GeoSeries, expected: pyspark.sql.DataFrame
- ):
- assert isinstance(actual, GeoSeries)
- assert isinstance(expected, pyspark.sql.DataFrame)
- expected = expected.selectExpr("ST_AsText(expected) as expected")
- sgpd_result = actual.to_geopandas()
- expected = expected.toPandas()["expected"]
- for a, e in zip(sgpd_result, expected):
- self.assert_geometry_almost_equal(a, e)
-
- def check_sgpd_equals_gpd(self, actual: GeoSeries, expected: gpd.GeoSeries):
- assert isinstance(actual, GeoSeries)
- assert isinstance(expected, gpd.GeoSeries)
- sgpd_result = actual.to_geopandas()
- for a, e in zip(sgpd_result, expected):
- if a is None or e is None:
- assert a is None and e is None
- continue
- # Sometimes sedona and geopandas both return empty geometries but of different types (e.g Point and Polygon)
- elif a.is_empty and e.is_empty:
- continue
- self.assert_geometry_almost_equal(
- a, e, tolerance=1e-2
- ) # increased tolerance from 1e-6
-
- def check_pd_series_equal(self, actual: ps.Series, expected: pd.Series):
- assert isinstance(actual, ps.Series)
- assert isinstance(expected, pd.Series)
- assert_series_equal(actual.to_pandas(), expected)