This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 22307f4125 [GH-2269] Geopandas: Fix geodataframe.copy() to properly create a copy (#2270)
22307f4125 is described below
commit 22307f4125abb46a78f0a30ba3fc3045aa21e91e
Author: Peter Nguyen <[email protected]>
AuthorDate: Mon Aug 11 22:58:08 2025 -0700
[GH-2269] Geopandas: Fix geodataframe.copy() to properly create a copy (#2270)
---
python/sedona/spark/geopandas/geodataframe.py | 14 +++++-----
python/tests/geopandas/test_geodataframe.py | 39 +++++++++++++++------------
2 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/python/sedona/spark/geopandas/geodataframe.py b/python/sedona/spark/geopandas/geodataframe.py
index 2470f310a0..6e379dca69 100644
--- a/python/sedona/spark/geopandas/geodataframe.py
+++ b/python/sedona/spark/geopandas/geodataframe.py
@@ -574,7 +574,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
if inplace:
frame = self
else:
- frame = self.copy(deep=False)
+ frame = self.copy()
geo_column_name = self._geometry_column_name
new_series = False
@@ -785,7 +785,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
"""
return pspd.DataFrame(self._internal)
- def copy(self, deep=False):
+ def copy(self, deep=False) -> GeoDataFrame:
"""
Make a copy of this GeoDataFrame object.
@@ -810,12 +810,10 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
geometry value1 value2
0 POINT (1 1) 2 3
"""
- if deep:
- return GeoDataFrame(
- self._anchor.copy(), dtype=self.dtypes, index=self._col_label
- )
- else:
- return self # GeoDataFrame(self._internal.spark_frame.copy())
"this parameter is not supported but just dummy parameter to match pandas."
+ # Note: The deep parameter is a dummy parameter just as it is in PySpark pandas
+ return GeoDataFrame(
+ pspd.DataFrame(self._internal.copy()),
geometry=self.active_geometry_name
+ )
def _safe_get_crs(self):
"""
diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py
index 47b5eb388b..5780a9e36c 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -277,9 +277,12 @@ class TestGeoDataFrame(TestGeopandasBase):
)
def test_copy(self):
- df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df")
- df_copy = df.copy()
- assert type(df_copy) is GeoDataFrame
+ df = GeoDataFrame(
+ {"test": [Point(x, x) for x in range(3)]}, index=[1, 2, 3],
geometry="test"
+ )
+ result = df.copy()
+ self.check_sgpd_df_equals_gpd_df(result, df.to_geopandas())
+ self.check_sgpd_df_equals_gpd_df(df, result.to_geopandas())
def test_set_crs(self):
sgpd_df = sgpd.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
@@ -297,9 +300,12 @@ class TestGeoDataFrame(TestGeopandasBase):
assert sgpd_df.crs is None
with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.set_crs(4326, allow_override=True)
- assert isinstance(sgpd_df, GeoDataFrame)
- assert sgpd_df.crs.to_epsg() == 4326
+ result = sgpd_df.set_crs(4326, allow_override=True)
+ assert result.crs.to_epsg() == 4326
+ assert isinstance(result, GeoDataFrame)
+
+ # Ensure set_crs without inplace modifies a copy and not current df
+ assert sgpd_df.crs is None
def test_to_crs(self):
from pyproj import CRS
@@ -313,6 +319,8 @@ class TestGeoDataFrame(TestGeopandasBase):
with self.ps_allow_diff_frames():
result = gdf.to_crs(3857)
assert isinstance(result.crs, CRS) and result.crs.to_epsg() == 3857
+ # Ensure original df is not modified
+ assert gdf.crs.to_epsg() == 4326
expected = gpd.GeoSeries(
[
@@ -338,21 +346,18 @@ class TestGeoDataFrame(TestGeopandasBase):
with pytest.raises(MissingGeometryColumnError):
_ = sgpd_df.geometry
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.set_geometry("geometry1")
+ sgpd_df.set_geometry("geometry1", inplace=True)
assert sgpd_df.geometry.name == "geometry1"
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- sgpd_df.set_geometry("geometry2", inplace=True)
- assert sgpd_df.geometry.name == "geometry2"
+ result = sgpd_df.set_geometry("geometry2")
+ assert result.geometry.name == "geometry2"
- # Test the actual values of the geometry column
- assert_series_equal(
- sgpd_df.geometry.area.to_pandas(),
sgpd_df["geometry2"].area.to_pandas()
- )
+ # Ensure original df is not modified
+ assert sgpd_df.geometry.name == "geometry1"
+
+ # Test the actual values of the geometry column equal for an area calculation
+ self.check_pd_series_equal(result.area,
sgpd_df["geometry2"].area.to_pandas())
# unknown column
with pytest.raises(ValueError):