This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new e87eaa9875 [GH-2201] Geopandas: Optimize crs operations, getitem, and
set_geometry + impl scalable getitem (#2203)
e87eaa9875 is described below
commit e87eaa9875c07cf00a55319913bf02b45cfdbd6f
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jul 31 08:34:29 2025 -0700
[GH-2201] Geopandas: Optimize crs operations, getitem, and set_geometry +
impl scalable getitem (#2203)
* Improvements to geodataframe's getitem, constructor, and set_geometry +
add to_spark_pandas function
* Add tests
* Clean up
* Implement crs, set_crs for GeoDataFrame correctly
* Make Geoseries.set_crs lazy
* Make geodataframe getitem and set_geometry lazy
* Optimize geoseries get crs by using first_value to pre-filter
* Set allow_override=True default in geodataframe too
* Clarify comment
* Turn np.nan to 0 when getting geoseries crs
* Apply suggestions from code review
Co-authored-by: Copilot <[email protected]>
* Implement and test to_crs for geodataframe
* empty commit (stac tests flaked twice in a row)
---------
Co-authored-by: Copilot <[email protected]>
---
python/sedona/geopandas/geodataframe.py | 316 ++++++++++++++++++++--------
python/sedona/geopandas/geoseries.py | 80 ++++---
python/tests/geopandas/test_geodataframe.py | 120 +++++++++++
python/tests/geopandas/test_geoseries.py | 22 +-
4 files changed, 418 insertions(+), 120 deletions(-)
diff --git a/python/sedona/geopandas/geodataframe.py
b/python/sedona/geopandas/geodataframe.py
index b2c3060c04..fa4b1bc825 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -309,55 +309,22 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
Name: value, dtype: int64
"""
- # Handle column access by name
- if isinstance(key, str):
- # Access column directly from the spark DataFrame
- column_name = key
-
- # Check if column exists
- if column_name not in self.columns:
- raise KeyError(f"Column '{column_name}' does not exist")
-
- # Here we are getting a ps.Series with the same underlying anchor
(ps.Dataframe).
- # This is important so we don't unnecessarily try to perform
operations on different dataframes
- ps_series: pspd.Series = pspd.DataFrame.__getitem__(self,
column_name)
-
+ # Here we are getting a ps.Series with the same underlying anchor
(ps.Dataframe).
+ # This is important so we don't unnecessarily try to perform
operations on different dataframes
+ item = pspd.DataFrame.__getitem__(self, key)
+
+ if isinstance(item, pspd.DataFrame):
+ # don't specify crs=self.crs here because it might not include the
geometry column
+ # if it does include the geometry column, we don't need to set crs
anyways
+ return GeoDataFrame(item)
+ elif isinstance(item, pspd.Series):
+ ps_series: pspd.Series = item
try:
- result = sgpd.GeoSeries(ps_series)
- not_null = ps_series[ps_series.notnull()]
- if len(not_null) > 0:
- first_geom = not_null.iloc[0]
- srid = shapely.get_srid(first_geom)
-
- # Shapely objects stored in the ps.Series retain their srid
- # but the GeoSeries does not, so we manually re-set it here
- if srid > 0:
- result.set_crs(srid, inplace=True)
- return result
+ return sgpd.GeoSeries(ps_series)
except TypeError:
return ps_series
-
- # Handle list of column names
- elif isinstance(key, list) and all(isinstance(k, str) for k in key):
- # Check if all columns exist
- missing_cols = [k for k in key if k not in self.columns]
- if missing_cols:
- raise KeyError(f"Columns {missing_cols} do not exist")
-
- # Select columns from the spark DataFrame
- spark_df = self._internal.spark_frame.select(*key)
- pandas_df = spark_df.toPandas()
-
- # Return as GeoDataFrame
- return GeoDataFrame(pandas_df)
-
- # Handle row selection via slice or boolean indexing
else:
- # For now, convert to pandas first for row-based operations
- # This could be optimized later for better performance
- pandas_df = self._internal.spark_frame.toPandas()
- selected_rows = pandas_df[key]
- return GeoDataFrame(selected_rows)
+ raise Exception(f"Logical Error: Unexpected type: {type(item)}")
_geometry_column_name = None
@@ -384,49 +351,45 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
from sedona.geopandas import GeoSeries
from pyspark.sql import DataFrame as SparkDataFrame
- if isinstance(data, GeoDataFrame):
- data_crs = data._safe_get_crs()
- if data_crs is not None:
- data.crs = data_crs
-
- super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
-
- elif isinstance(data, GeoSeries):
- if data.crs is None:
+ if isinstance(data, (GeoDataFrame, GeoSeries)):
+ if crs:
data.crs = crs
# For each of these super().__init__() calls, we let pyspark
decide which inputs are valid or not
# instead of calling e.g assert not dtype ourselves.
# This way, if Spark adds support later, then we inherit those
changes naturally
super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
+
elif isinstance(data, (PandasOnSparkDataFrame, SparkDataFrame)):
super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
elif isinstance(data, PandasOnSparkSeries):
try:
- data = GeoSeries(data)
+ data = GeoSeries(data, crs=crs)
except TypeError:
pass
super().__init__(data, index=index, columns=columns, dtype=dtype,
copy=copy)
else:
# below are not distributed dataframe types
- if isinstance(data, pd.DataFrame):
- assert index is None
- assert dtype is None
- assert not copy
- # Need to convert GeoDataFrame to pd.DataFrame for below cast
to work
- pd_df = (
- pd.DataFrame(data) if isinstance(data, gpd.GeoDataFrame)
else data
- )
- else:
- pd_df = pd.DataFrame(
- data=data,
- index=index,
- dtype=dtype,
- copy=copy,
- )
+ if isinstance(data, gpd.GeoDataFrame):
+ # We can use GeoDataFrame.active_geometry_name once we drop
support for geopandas < 1.0.0
+ # Below is the equivalent, since active_geometry_name simply
calls _geometry_column_name
+ if data._geometry_column_name:
+ # Geopandas stores crs as metadata instead of inside of
the shapely objects so we must save it and set it manually later
+ if not crs:
+ crs = data.crs
+ if not geometry:
+ geometry = data.geometry.name
+
+ pd_df = pd.DataFrame(
+ data,
+ index=index,
+ columns=columns,
+ dtype=dtype,
+ copy=copy,
+ )
# Spark complains if it's left as a geometry type
geom_type_cols = pd_df.select_dtypes(include=["geometry"]).columns
@@ -447,7 +410,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
raise ValueError(crs_mismatch_error)
if geometry:
- self.set_geometry(geometry, inplace=True)
+ self.set_geometry(geometry, inplace=True, crs=crs)
if geometry is None and "geometry" in self.columns:
@@ -462,8 +425,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
geom_crs = geometry.crs
if geom_crs is None:
if crs is not None:
- geometry.set_crs(crs, inplace=True)
- self.set_geometry(geometry, inplace=True)
+ self.set_geometry(geometry, inplace=True, crs=crs)
else:
if crs is not None and geom_crs != crs:
raise ValueError(crs_mismatch_error)
@@ -625,6 +587,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
frame = self.copy(deep=False)
geo_column_name = self._geometry_column_name
+ new_series = False
if geo_column_name is None:
geo_column_name = "geometry"
@@ -647,6 +610,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
level = col.rename(geo_column_name)
else:
level = pspd.Series(col, name=geo_column_name)
+
+ if not isinstance(level, sgpd.GeoSeries):
+ # Set the crs later, so we can allow_override=True
+ level = sgpd.GeoSeries(level)
+
+ new_series = True
elif hasattr(col, "ndim") and col.ndim > 1:
raise ValueError("Must pass array with one dimension only.")
else: # should be a colname
@@ -689,20 +658,15 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
# if not dropping, set the active geometry name to the given
col name
geo_column_name = col
- if not crs:
- crs = getattr(level, "crs", None)
-
- # Check that we are using a listlike of geometries
- level = _ensure_geometry(level, crs=crs)
- # ensure_geometry only sets crs on level if it has crs==None
-
# This operation throws a warning to the user asking them to set
pspd.set_option('compute.ops_on_diff_frames', True)
# to allow operations on different frames. We pass these warnings on
to the user so they must manually set it themselves.
- if level.crs != crs:
+ if crs:
level.set_crs(crs, inplace=True, allow_override=True)
+ new_series = True
frame._geometry_column_name = geo_column_name
- frame[geo_column_name] = level
+ if new_series:
+ frame[geo_column_name] = level
if not inplace:
return frame
@@ -824,6 +788,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
return gpd.GeoDataFrame(pd_df, geometry=self._geometry_column_name)
+ def to_spark_pandas(self) -> pspd.DataFrame:
+ """
+ Convert the GeoDataFrame to a Spark Pandas DataFrame.
+ """
+ return pspd.DataFrame(self._internal)
+
@property
def sindex(self) -> SpatialIndex | None:
"""
@@ -887,10 +857,184 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
@crs.setter
def crs(self, value):
- # Avoid trying to access the geometry column (which might be missing)
if crs is None
- if value is None:
- return
- self.geometry.crs = value
+ # Since pyspark dataframes are immutable, we can't modify in place, so
we create the new geoseries and replace it
+ self.geometry = self.geometry.set_crs(value)
+
+ def set_crs(self, crs, inplace=False, allow_override=True):
+ """
+ Set the Coordinate Reference System (CRS) of the ``GeoDataFrame``.
+
+ If there are multiple geometry columns within the GeoDataFrame, only
+ the CRS of the active geometry column is set.
+
+ Pass ``None`` to remove CRS from the active geometry column.
+
+ Notes
+ -----
+ The underlying geometries are not transformed to this CRS. To
+ transform the geometries to a new CRS, use the ``to_crs`` method.
+
+ Parameters
+ ----------
+ crs : pyproj.CRS | None, optional
+ The value can be anything accepted
+ by :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ epsg : int, optional
+ EPSG code specifying the projection.
+ inplace : bool, default False
+ If True, the CRS of the GeoDataFrame will be changed in place
+ (in which case ``None`` is returned) instead of making a copy of
+ the GeoDataFrame.
+ allow_override : bool, default True
+ If the GeoDataFrame already has a CRS, allow to replace the
+ existing CRS, even when both are not equal. In Sedona, setting
this to False
+ will lead to eager evaluation instead of lazy evaluation. Unlike
Geopandas,
+ True is the default value in Sedona for performance reasons.
+
+ Examples
+ --------
+ >>> from sedona.geopandas import GeoDataFrame
+ >>> from shapely.geometry import Point
+ >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2),
Point(2, 1)]}
+ >>> gdf = GeoDataFrame(d)
+ >>> gdf
+ col1 geometry
+ 0 name1 POINT (1 2)
+ 1 name2 POINT (2 1)
+
+ Setting CRS to a GeoDataFrame without one:
+
+ >>> gdf.crs is None
+ True
+
+ >>> gdf = gdf.set_crs('epsg:3857')
+ >>> gdf.crs # doctest: +SKIP
+ <Projected CRS: EPSG:3857>
+ Name: WGS 84 / Pseudo-Mercator
+ Axis Info [cartesian]:
+ - X[east]: Easting (metre)
+ - Y[north]: Northing (metre)
+ Area of Use:
+ - name: World - 85°S to 85°N
+ - bounds: (-180.0, -85.06, 180.0, 85.06)
+ Coordinate Operation:
+ - name: Popular Visualisation Pseudo-Mercator
+ - method: Popular Visualisation Pseudo Mercator
+ Datum: World Geodetic System 1984
+ - Ellipsoid: WGS 84
+ - Prime Meridian: Greenwich
+
+ Overriding existing CRS:
+
+ >>> gdf = gdf.set_crs(4326, allow_override=True)
+
+ With ``allow_override=False``, ``set_crs`` raises an error if you
try to
+ override CRS.
+
+ See Also
+ --------
+ GeoDataFrame.to_crs : re-project to another CRS
+ """
+ # Since pyspark dataframes are immutable, we can't modify in place, so
we create the new geoseries and replace it
+ new_geometry = self.geometry.set_crs(crs,
allow_override=allow_override)
+ if inplace:
+ self.geometry = new_geometry
+ else:
+ df = self.copy()
+ df.geometry = new_geometry
+ return df
+
+ def to_crs(
+ self,
+ crs: Any | None = None,
+ epsg: int | None = None,
+ inplace: bool = False,
+ ) -> GeoDataFrame | None:
+ """Transform geometries to a new coordinate reference system.
+
+ Transform all geometries in an active geometry column to a different
coordinate
+ reference system. The ``crs`` attribute on the current GeoSeries must
+ be set. Either ``crs`` or ``epsg`` may be specified for output.
+
+ This method will transform all points in all objects. It has no notion
+ of projecting entire geometries. All segments joining points are
+ assumed to be lines in the current projection, not geodesics. Objects
+ crossing the dateline (or other projection boundary) will have
+ undesirable behavior.
+
+ Parameters
+ ----------
+ crs : pyproj.CRS, optional if `epsg` is specified
+ The value can be anything accepted by
+ :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ epsg : int, optional if `crs` is specified
+ EPSG code specifying output projection.
+ inplace : bool, optional, default: False
+ Whether to return a new GeoDataFrame or do the transformation in
+ place.
+
+ Returns
+ -------
+ GeoDataFrame
+
+ Examples
+ --------
+ >>> from shapely.geometry import Point
+ >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2),
Point(2, 1)]}
+ >>> gdf = geopandas.GeoDataFrame(d, crs=4326)
+ >>> gdf
+ col1 geometry
+ 0 name1 POINT (1 2)
+ 1 name2 POINT (2 1)
+ >>> gdf.crs # doctest: +SKIP
+ <Geographic 2D CRS: EPSG:4326>
+ Name: WGS 84
+ Axis Info [ellipsoidal]:
+ - Lat[north]: Geodetic latitude (degree)
+ - Lon[east]: Geodetic longitude (degree)
+ Area of Use:
+ - name: World
+ - bounds: (-180.0, -90.0, 180.0, 90.0)
+ Datum: World Geodetic System 1984
+ - Ellipsoid: WGS 84
+ - Prime Meridian: Greenwich
+
+ >>> gdf = gdf.to_crs(3857)
+ >>> gdf
+ col1 geometry
+ 0 name1 POINT (111319.491 222684.209)
+ 1 name2 POINT (222638.982 111325.143)
+ >>> gdf.crs # doctest: +SKIP
+ <Projected CRS: EPSG:3857>
+ Name: WGS 84 / Pseudo-Mercator
+ Axis Info [cartesian]:
+ - X[east]: Easting (metre)
+ - Y[north]: Northing (metre)
+ Area of Use:
+ - name: World - 85°S to 85°N
+ - bounds: (-180.0, -85.06, 180.0, 85.06)
+ Coordinate Operation:
+ - name: Popular Visualisation Pseudo-Mercator
+ - method: Popular Visualisation Pseudo Mercator
+ Datum: World Geodetic System 1984
+ - Ellipsoid: WGS 84
+ - Prime Meridian: Greenwich
+
+ See Also
+ --------
+ GeoDataFrame.set_crs : assign CRS without re-projection
+ """
+ new_geometry = self.geometry.to_crs(crs=crs, epsg=epsg)
+ if inplace:
+ df = self
+ df.geometry = new_geometry
+ return None
+ else:
+ df = self.copy()
+ df.geometry = new_geometry
+ return df
@classmethod
def from_dict(
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index 2075895681..5f2b8dcb19 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -15,10 +15,10 @@
# specific language governing permissions and limitations
# under the License.
-import os
import typing
from typing import Any, Union, Literal, List
+import numpy as np
import geopandas as gpd
import sedona.geopandas as sgpd
import pandas as pd
@@ -28,8 +28,8 @@ from pyspark.pandas import Series as PandasOnSparkSeries
from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
from pyspark.pandas.internal import InternalFrame
from pyspark.pandas.series import first_series
-from pyspark.pandas.utils import scol_for, log_advice
-from pyspark.sql.types import BinaryType, NullType
+from pyspark.pandas.utils import scol_for
+from pyspark.sql.types import NullType
from sedona.spark.sql.types import GeometryType
from sedona.spark.sql import st_aggregates as sta
@@ -361,16 +361,18 @@ class GeoSeries(GeoFrame, pspd.Series):
assert not copy
assert not fastpath
- data_crs = None
- if hasattr(data, "crs"):
- data_crs = data.crs
- if data_crs is not None and crs is not None and data_crs != crs:
- raise ValueError(
- "CRS mismatch between CRS of the passed geometries "
- "and 'crs'. Use 'GeoSeries.set_crs(crs, "
- "allow_override=True)' to overwrite CRS or "
- "'GeoSeries.to_crs(crs)' to reproject geometries. "
- )
+ # We don't check crs validity to keep the operation lazy.
+ # Keep the original code for now
+ # data_crs = None
+ # if hasattr(data, "crs"):
+ # data_crs = data.crs
+ # if data_crs is not None and crs is not None and data_crs != crs:
+ # raise ValueError(
+ # "CRS mismatch between CRS of the passed geometries "
+ # "and 'crs'. Use 'GeoSeries.set_crs(crs, "
+ # "allow_override=True)' to overwrite CRS or "
+ # "'GeoSeries.to_crs(crs)' to reproject geometries. "
+ # )
# PySpark Pandas' ps.Series.__init__() does not support construction from a
# ps.Series input. For now, we manually implement the logic.
@@ -463,23 +465,36 @@ class GeoSeries(GeoFrame, pspd.Series):
if len(self) == 0:
return None
- spark_col = stf.ST_SRID(self.spark.column)
+ if parse_version(pyspark.__version__) >= parse_version("3.5.0"):
+ spark_col = stf.ST_SRID(F.first_value(self.spark.column,
ignoreNulls=True))
+ # Set this to avoid error complaining that we don't have a groupby
column
+ is_aggr = True
+ else:
+ spark_col = stf.ST_SRID(self.spark.column)
+ is_aggr = False
+
tmp_series = self._query_geometry_column(
spark_col,
returns_geom=False,
+ is_aggr=is_aggr,
)
# All geometries should have the same srid
# so we just take the srid of the first non-null element
- first_idx = tmp_series.first_valid_index()
- srid = tmp_series[first_idx] if first_idx is not None else 0
+
+ if parse_version(pyspark.__version__) >= parse_version("3.5.0"):
+ srid = tmp_series.item()
+ # Turn np.nan to 0 to avoid error
+ srid = 0 if np.isnan(srid) else srid
+ else:
+ first_idx = tmp_series.first_valid_index()
+ srid = tmp_series[first_idx] if first_idx is not None else 0
# Sedona returns 0 if doesn't exist
return CRS.from_user_input(srid) if srid != 0 else None
@crs.setter
def crs(self, value: Union["CRS", None]):
- # Implementation of the abstract method
self.set_crs(value, inplace=True)
@typing.overload
@@ -505,7 +520,7 @@ class GeoSeries(GeoFrame, pspd.Series):
crs: Union[Any, None] = None,
epsg: Union[int, None] = None,
inplace: bool = False,
- allow_override: bool = False,
+ allow_override: bool = True,
) -> Union["GeoSeries", None]:
"""
Set the Coordinate Reference System (CRS) of a ``GeoSeries``.
@@ -529,9 +544,11 @@ class GeoSeries(GeoFrame, pspd.Series):
If True, the CRS of the GeoSeries will be changed in place
(while still returning the result) instead of making a copy of
the GeoSeries.
- allow_override : bool, default False
+ allow_override : bool, default True
If the GeoSeries already has a CRS, allow to replace the
- existing CRS, even when both are not equal.
+ existing CRS, even when both are not equal. In Sedona, setting
this to False
+ will lead to eager evaluation instead of lazy evaluation. Unlike
Geopandas,
+ True is the default value in Sedona for performance reasons.
Returns
-------
@@ -589,19 +606,18 @@ class GeoSeries(GeoFrame, pspd.Series):
elif epsg is not None:
crs = CRS.from_epsg(epsg)
- curr_crs = self.crs
-
- # If CRS is the same, do nothing
- if curr_crs == crs:
- return
+ # The below block for the not allow_override case is eager due to the
self.crs call
+ # This hurts performance and user experience, hence the default being
set to True in Sedona
+ if not allow_override:
+ curr_crs = self.crs
- if not allow_override and curr_crs is not None and not curr_crs == crs:
- raise ValueError(
- "The GeoSeries already has a CRS which is not equal to the
passed "
- "CRS. Specify 'allow_override=True' to allow replacing the
existing "
- "CRS without doing any transformation. If you actually want to
"
- "transform the geometries, use 'GeoSeries.to_crs' instead."
- )
+ if curr_crs is not None and not curr_crs == crs:
+ raise ValueError(
+ "The GeoSeries already has a CRS which is not equal to the
passed "
+ "CRS. Specify 'allow_override=True' to allow replacing the
existing "
+ "CRS without doing any transformation. If you actually
want to "
+ "transform the geometries, use 'GeoSeries.to_crs' instead."
+ )
# 0 indicates no srid in sedona
new_epsg = crs.to_epsg() if crs else 0
diff --git a/python/tests/geopandas/test_geodataframe.py
b/python/tests/geopandas/test_geodataframe.py
index abda5010c3..bc50673d55 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -62,6 +62,16 @@ class TestGeoDataFrame(TestGeopandasBase):
sgpd_df = GeoDataFrame(obj)
check_geodataframe(sgpd_df)
+ def test_construct_from_geopandas(self):
+ gpd_df = gpd.GeoDataFrame(
+ {"geometry1": [Point(0, 0)]}, geometry="geometry1", crs="EPSG:3857"
+ )
+ with self.ps_allow_diff_frames():
+ sgpd_df = GeoDataFrame(gpd_df)
+ assert sgpd_df.crs == "EPSG:3857"
+ assert sgpd_df.geometry.crs == "EPSG:3857"
+ assert sgpd_df.geometry.name == "geometry1"
+
@pytest.mark.parametrize(
"obj",
[
@@ -126,6 +136,57 @@ class TestGeoDataFrame(TestGeopandasBase):
sgpd_df = sgpd.GeoDataFrame(obj)
assert_frame_equal(pd_df, sgpd_df.to_pandas())
+ def test_to_geopandas(self):
+ from geopandas.testing import assert_geodataframe_equal
+
+ data = {"geometry": [Point(x, x) for x in range(3)], "id": [1, 2, 3]}
+ index = [1, 2, 3]
+ crs = "EPSG:3857"
+ # TODO: try to optimize this away
+ with self.ps_allow_diff_frames():
+ result = GeoDataFrame(data, index=index, crs=crs).to_geopandas()
+ gpd_df = gpd.GeoDataFrame(data, index=index, crs=crs)
+ assert_geodataframe_equal(result, gpd_df)
+
+ def test_to_spark_pandas(self):
+ data = {"geometry": [Point(x, x) for x in range(3)], "id": [1, 2, 3]}
+ index = [1, 2, 3]
+ result = GeoDataFrame(data, index=index).to_spark_pandas()
+ ps_df = ps.DataFrame(data, index=index)
+ assert_frame_equal(result.to_pandas(), ps_df.to_pandas())
+
+ def test_getitem(self):
+ geoms = [Point(x, x) for x in range(3)]
+ ids = [1, 2, 3]
+ values = ["a", "b", "c"]
+ crs = "EPSG:3857"
+
+ with self.ps_allow_diff_frames():
+ df = GeoDataFrame({"geometry": geoms, "id": ids, "value": values},
crs=crs)
+
+ # get a single non-geometry series
+ result = df["id"]
+ expected = pd.Series(ids, name="id")
+ self.check_pd_series_equal(result, expected)
+
+ # get a single geometry series
+ result = df["geometry"]
+ expected = gpd.GeoSeries(geoms, name="geometry", crs=crs)
+ self.check_sgpd_equals_gpd(result, expected)
+
+ # get multiple columns
+ result = df[["id", "value"]]
+ # no crs because no geometry column
+ expected = gpd.GeoDataFrame({"id": ids, "value": values})
+ self.check_sgpd_df_equals_gpd_df(result, expected)
+
+ # get numerical slice
+ result = df[:2]
+ expected = gpd.GeoDataFrame(
+ {"geometry": geoms[:2], "id": ids[:2], "value": values[:2]},
crs=crs
+ )
+ self.check_sgpd_df_equals_gpd_df(result, expected)
+
def test_plot(self):
# Just make sure it doesn't error
df = GeoDataFrame(
@@ -220,6 +281,50 @@ class TestGeoDataFrame(TestGeopandasBase):
df_copy = df.copy()
assert type(df_copy) is GeoDataFrame
+ def test_set_crs(self):
+ sgpd_df = sgpd.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
+ with self.ps_allow_diff_frames():
+ sgpd_df.crs = 4326
+ assert sgpd_df.crs.to_epsg() == 4326
+
+ with self.ps_allow_diff_frames():
+ sgpd_df.set_crs(3857, inplace=True, allow_override=True)
+ assert sgpd_df.crs.to_epsg() == 3857
+
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_crs(None, allow_override=True)
+ assert isinstance(sgpd_df, GeoDataFrame)
+ assert sgpd_df.crs is None
+
+ with self.ps_allow_diff_frames():
+ sgpd_df = sgpd_df.set_crs(4326, allow_override=True)
+ assert isinstance(sgpd_df, GeoDataFrame)
+ assert sgpd_df.crs.to_epsg() == 4326
+
+ def test_to_crs(self):
+ from pyproj import CRS
+
+ with self.ps_allow_diff_frames():
+ gdf = sgpd.GeoDataFrame(
+ {"geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]}, crs=4326
+ )
+ assert isinstance(gdf.crs, CRS) and gdf.crs.to_epsg() == 4326
+
+ with self.ps_allow_diff_frames():
+ result = gdf.to_crs(3857)
+ assert isinstance(result.crs, CRS) and result.crs.to_epsg() == 3857
+
+ expected = gpd.GeoSeries(
+ [
+ Point(111319.49079327356, 111325.14286638486),
+ Point(222638.98158654712, 222684.20850554455),
+ Point(333958.4723798207, 334111.1714019597),
+ ],
+ name="geometry",
+ crs=3857,
+ )
+ self.check_sgpd_equals_gpd(result.geometry, expected)
+
def test_set_geometry(self):
from sedona.geopandas.geodataframe import MissingGeometryColumnError
@@ -279,6 +384,21 @@ class TestGeoDataFrame(TestGeopandasBase):
assert new_df.crs == sgpd_df.crs
assert new_df.geometry.crs == sgpd_df.crs
+ def test_set_geometry_crs(self):
+ df = GeoDataFrame({"geometry1": [Point(0, 0)]})
+ with self.ps_allow_diff_frames():
+ df.set_geometry("geometry1", crs="EPSG:3857", inplace=True)
+ assert df.crs == "EPSG:3857"
+ assert df.geometry.crs == "EPSG:3857"
+
+ with self.ps_allow_diff_frames():
+ df = GeoDataFrame(
+ {"geometry1": [Point(0, 0)]}, geometry="geometry1",
crs="EPSG:3857"
+ )
+
+ assert df.crs == "EPSG:3857"
+ assert df.geometry.crs == "EPSG:3857"
+
def test_active_geometry_name(self):
if parse_version(gpd.__version__) < parse_version("1.0.0"):
return
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index d73545f08a..5f9b439874 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -95,6 +95,24 @@ class TestGeoSeries(TestGeopandasBase):
sgpd_series = GeoSeries(obj)
assert isinstance(sgpd_series, sgpd.GeoSeries)
+ def test_to_geopandas(self):
+ from geopandas.testing import assert_geoseries_equal
+
+ data = [Point(x, x) for x in range(3)]
+ index = [1, 2, 3]
+ crs = "EPSG:3857"
+ result = GeoSeries(data, index=index, crs=crs).to_geopandas()
+ gpd_df = gpd.GeoSeries(data, index=index, crs=crs)
+ assert_geoseries_equal(result, gpd_df)
+
+ def test_to_spark_pandas(self):
+ data = [Point(x, x) for x in range(3)]
+ index = [1, 2, 3]
+ crs = "EPSG:3857"
+ result = GeoSeries(data, index=index, crs=crs).to_spark_pandas()
+ ps_df = ps.Series(data, index=index)
+ assert_series_equal(result.to_pandas(), ps_df.to_pandas())
+
def test_plot(self):
# Just make sure it doesn't error
self.geoseries.plot()
@@ -1642,9 +1660,9 @@ e": "Feature", "properties": {}, "geometry": {"type":
"Point", "coordinates": [3
assert geo_series.crs.to_epsg() == 4326
with pytest.raises(ValueError):
- geo_series.set_crs(4328)
+ geo_series.set_crs(4328, allow_override=False)
with pytest.raises(ValueError):
- geo_series.crs = None
+ geo_series.set_crs(None, allow_override=False)
geo_series = geo_series.set_crs(None, allow_override=True)
assert geo_series.crs == None