This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new aa7fb3edf4 [GH-2281] Geopandas: set ps option 'compute.ops_on_diff_frames' to True by default (#2282)
aa7fb3edf4 is described below
commit aa7fb3edf466f29cb07bb2388008f43349e61e95
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Aug 19 13:42:04 2025 -0700
[GH-2281] Geopandas: set ps option 'compute.ops_on_diff_frames' to True by default (#2282)
* Set ps.compute.ops_on_diff_frames to True by default for users, and False
for testing
* Remove unnecessary 'with self.ps_allow_diff_frames():' calls
* Remove 'self.ps_allow_diff_frames' calls and use built in
'ps.option_context(compute.ops_on_diff_frames, True)' instead
* PR feedback
---
python/sedona/spark/geopandas/__init__.py | 11 +++++
python/tests/geopandas/test_geodataframe.py | 51 +++++++++-------------
python/tests/geopandas/test_geopandas_base.py | 17 ++------
python/tests/geopandas/test_geoseries.py | 32 ++++++--------
.../geopandas/test_match_geopandas_dataframe.py | 41 ++++++-----------
5 files changed, 63 insertions(+), 89 deletions(-)
diff --git a/python/sedona/spark/geopandas/__init__.py
b/python/sedona/spark/geopandas/__init__.py
index 736fa3e939..cf6c0af175 100644
--- a/python/sedona/spark/geopandas/__init__.py
+++ b/python/sedona/spark/geopandas/__init__.py
@@ -26,3 +26,14 @@ from sedona.spark.geopandas.geodataframe import GeoDataFrame
from sedona.spark.geopandas.tools import sjoin
from sedona.spark.geopandas.io import read_file, read_parquet
+
+# This used to default to False, but Spark 4.0.0 changed it to True
+# We also want it to default to True for Sedona, so we set it here
+# to apply the change for users using Spark < 4.0.0.
+
+# Having this set to False will cause these errors, which most users should not have to worry about:
+# ValueError: Cannot combine the series or dataframe because it comes from a different dataframe.
+# In order to allow this operation, enable 'compute.ops_on_diff_frames' option.
+import pyspark.pandas as ps
+
+ps.set_option("compute.ops_on_diff_frames", True)
diff --git a/python/tests/geopandas/test_geodataframe.py
b/python/tests/geopandas/test_geodataframe.py
index 5780a9e36c..372efeeadd 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -58,15 +58,14 @@ class TestGeoDataFrame(TestGeopandasBase):
],
)
def test_constructor(self, obj):
- with self.ps_allow_diff_frames():
- sgpd_df = GeoDataFrame(obj)
+ sgpd_df = GeoDataFrame(obj)
check_geodataframe(sgpd_df)
def test_construct_from_geopandas(self):
gpd_df = gpd.GeoDataFrame(
{"geometry1": [Point(0, 0)]}, geometry="geometry1", crs="EPSG:3857"
)
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
sgpd_df = GeoDataFrame(gpd_df)
assert sgpd_df.crs == "EPSG:3857"
assert sgpd_df.geometry.crs == "EPSG:3857"
@@ -143,7 +142,7 @@ class TestGeoDataFrame(TestGeopandasBase):
index = [1, 2, 3]
crs = "EPSG:3857"
# TODO: try to optimize this away
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
result = GeoDataFrame(data, index=index, crs=crs).to_geopandas()
gpd_df = gpd.GeoDataFrame(data, index=index, crs=crs)
assert_geodataframe_equal(result, gpd_df)
@@ -161,7 +160,7 @@ class TestGeoDataFrame(TestGeopandasBase):
values = ["a", "b", "c"]
crs = "EPSG:3857"
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
df = GeoDataFrame({"geometry": geoms, "id": ids, "value": values},
crs=crs)
# get a single non-geometry series
@@ -286,20 +285,20 @@ class TestGeoDataFrame(TestGeopandasBase):
def test_set_crs(self):
sgpd_df = sgpd.GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]})
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
sgpd_df.crs = 4326
assert sgpd_df.crs.to_epsg() == 4326
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
sgpd_df.set_crs(3857, inplace=True, allow_override=True)
assert sgpd_df.crs.to_epsg() == 3857
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
sgpd_df = sgpd_df.set_crs(None, allow_override=True)
assert isinstance(sgpd_df, GeoDataFrame)
assert sgpd_df.crs is None
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
result = sgpd_df.set_crs(4326, allow_override=True)
assert result.crs.to_epsg() == 4326
assert isinstance(result, GeoDataFrame)
@@ -310,13 +309,13 @@ class TestGeoDataFrame(TestGeopandasBase):
def test_to_crs(self):
from pyproj import CRS
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
gdf = sgpd.GeoDataFrame(
{"geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]}, crs=4326
)
assert isinstance(gdf.crs, CRS) and gdf.crs.to_epsg() == 4326
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
result = gdf.to_crs(3857)
assert isinstance(result.crs, CRS) and result.crs.to_epsg() == 3857
# Ensure original df is not modified
@@ -370,20 +369,20 @@ class TestGeoDataFrame(TestGeopandasBase):
# new crs - setting should default to GeoSeries' crs
gs = GeoSeries(geom, crs="epsg:3857")
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
new_df = sgpd_df.set_geometry(gs)
assert new_df.crs == "epsg:3857"
# explicit crs overrides self and dataframe
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
new_df = sgpd_df.set_geometry(gs, crs="epsg:26909")
assert new_df.crs == "epsg:26909"
assert new_df.geometry.crs == "epsg:26909"
# Series should use dataframe's crs
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
new_df = sgpd_df.set_geometry(geom.values)
assert new_df.crs == sgpd_df.crs
@@ -391,12 +390,12 @@ class TestGeoDataFrame(TestGeopandasBase):
def test_set_geometry_crs(self):
df = GeoDataFrame({"geometry1": [Point(0, 0)]})
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
df.set_geometry("geometry1", crs="EPSG:3857", inplace=True)
assert df.crs == "EPSG:3857"
assert df.geometry.crs == "EPSG:3857"
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
df = GeoDataFrame(
{"geometry1": [Point(0, 0)]}, geometry="geometry1",
crs="EPSG:3857"
)
@@ -414,14 +413,10 @@ class TestGeoDataFrame(TestGeopandasBase):
data = {"geometry1": points1, "geometry2": points2, "attribute": [1,
2, 3]}
df = GeoDataFrame(data)
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- df = df.set_geometry("geometry1")
+ df = df.set_geometry("geometry1")
assert df.geometry.name == df.active_geometry_name == "geometry1"
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- df.set_geometry("geometry2", inplace=True)
+ df.set_geometry("geometry2", inplace=True)
assert df.geometry.name == df.active_geometry_name == "geometry2"
def test_rename_geometry(self):
@@ -431,18 +426,14 @@ class TestGeoDataFrame(TestGeopandasBase):
data = {"geometry1": points1, "geometry2": points2, "attribute": [1,
2, 3]}
df = GeoDataFrame(data)
- # TODO: Try to optimize all of these with self.ps_allow_diff_frames()
calls away
- with self.ps_allow_diff_frames():
- df = df.set_geometry("geometry1")
+ df = df.set_geometry("geometry1")
assert df.geometry.name == "geometry1"
- with self.ps_allow_diff_frames():
- df = df.rename_geometry("geometry3")
+ df = df.rename_geometry("geometry3")
assert df.geometry.name == "geometry3"
# test inplace rename
- with self.ps_allow_diff_frames():
- df.rename_geometry("geometry4", inplace=True)
+ df.rename_geometry("geometry4", inplace=True)
assert df.geometry.name == "geometry4"
def test_area(self):
@@ -535,7 +526,7 @@ class TestGeoDataFrame(TestGeopandasBase):
d = {"col1": ["name1", "name2"], "geometry": [Point(1, 2), Point(2,
1)]}
# Currently, adding the crs information later requires us to join
across partitions
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
gdf = GeoDataFrame(d, crs="EPSG:3857")
result = gdf.to_json()
diff --git a/python/tests/geopandas/test_geopandas_base.py
b/python/tests/geopandas/test_geopandas_base.py
index 10b9a9271e..d2171bc1e1 100644
--- a/python/tests/geopandas/test_geopandas_base.py
+++ b/python/tests/geopandas/test_geopandas_base.py
@@ -33,6 +33,10 @@ class TestGeopandasBase(TestBase):
#
-----------------------------------------------------------------------------
# # Utils
#
-----------------------------------------------------------------------------
+ def setup_method(self):
+ # We enable this option by default for external users, but we disable it for development testing.
+ # This is useful to catch inefficiencies in the code while developing this package.
+ ps.set_option("compute.ops_on_diff_frames", False)
@classmethod
def check_sgpd_equals_spark_df(
@@ -114,19 +118,6 @@ class TestGeopandasBase(TestBase):
def contains_any_geom_collection(cls, geoms) -> bool:
return any(isinstance(g, GeometryCollection) for g in geoms)
- @contextmanager
- def ps_allow_diff_frames(self):
- """
- A context manager to temporarily set a compute.ops_on_diff_frames
option.
- """
- try:
- ps.set_option("compute.ops_on_diff_frames", True)
-
- # Yield control to the code inside the 'with' block
- yield
- finally:
- ps.reset_option("compute.ops_on_diff_frames")
-
def contains_any_geom_collection(self, geoms1, geoms2) -> bool:
return any(isinstance(g, GeometryCollection) for g in geoms1) or any(
isinstance(g, GeometryCollection) for g in geoms2
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 9dcefc609b..614e4dc112 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -47,6 +47,7 @@ from packaging.version import parse as parse_version
)
class TestGeoSeries(TestGeopandasBase):
def setup_method(self):
+ super().setup_method()
self.geoseries = sgpd.GeoSeries(
[
Point(2.3, -1),
@@ -500,9 +501,7 @@ class TestGeoSeries(TestGeopandasBase):
def test_to_json(self):
s = GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)])
- # TODO: optimize this away
- with self.ps_allow_diff_frames():
- result = s.to_json()
+ result = s.to_json()
expected = '{"type": "FeatureCollection", "features": [{"id": "0",
"type": "Feature", "pr\
operties": {}, "geometry": {"type": "Point", "coordinates": [1.0, 1.0]},
"bbox": [1.0,\
1.0, 1.0, 1.0]}, {"id": "1", "type": "Feature", "properties": {}, "geometry":
{"type"\
@@ -512,20 +511,17 @@ e": "Feature", "properties": {}, "geometry": {"type":
"Point", "coordinates": [3
assert result == expected
- with self.ps_allow_diff_frames():
- result = s.to_json(show_bbox=True)
- expected = '{"type": "FeatureCollection", "features": [{"id": "0",
"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [1.0, 1.0]}, "bbox": [1.0, 1.0, 1.0, 1.0]}, {"id": "1", "type":
"Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [2.0,
2.0]}, "bbox": [2.0, 2.0, 2.0, 2.0]}, {"id": "2", "type": "Feature",
"properties": {}, "geometry": {"type": "Point", "coordinates": [3.0, 3.0]},
"bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1 [...]
- assert result == expected
+ result = s.to_json(show_bbox=True)
+ expected = '{"type": "FeatureCollection", "features": [{"id": "0",
"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [1.0, 1.0]}, "bbox": [1.0, 1.0, 1.0, 1.0]}, {"id": "1", "type":
"Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [2.0,
2.0]}, "bbox": [2.0, 2.0, 2.0, 2.0]}, {"id": "2", "type": "Feature",
"properties": {}, "geometry": {"type": "Point", "coordinates": [3.0, 3.0]},
"bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1.0, [...]
+ assert result == expected
- with self.ps_allow_diff_frames():
- result = s.to_json(drop_id=True)
- expected = '{"type": "FeatureCollection", "features": [{"type":
"Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [1.0,
1.0]}, "bbox": [1.0, 1.0, 1.0, 1.0]}, {"type": "Feature", "properties": {},
"geometry": {"type": "Point", "coordinates": [2.0, 2.0]}, "bbox": [2.0, 2.0,
2.0, 2.0]}, {"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [3.0, 3.0]}, "bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1.0, 1.0,
3.0, 3.0]}'
- assert result == expected
+ result = s.to_json(drop_id=True)
+ expected = '{"type": "FeatureCollection", "features": [{"type":
"Feature", "properties": {}, "geometry": {"type": "Point", "coordinates": [1.0,
1.0]}, "bbox": [1.0, 1.0, 1.0, 1.0]}, {"type": "Feature", "properties": {},
"geometry": {"type": "Point", "coordinates": [2.0, 2.0]}, "bbox": [2.0, 2.0,
2.0, 2.0]}, {"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [3.0, 3.0]}, "bbox": [3.0, 3.0, 3.0, 3.0]}], "bbox": [1.0, 1.0,
3.0, 3.0]}'
+ assert result == expected
- with self.ps_allow_diff_frames():
- result = s.set_crs("EPSG:3857").to_json(to_wgs84=True)
- expected = '{"type": "FeatureCollection", "features": [{"id": "0",
"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [8.983152841195214e-06, 8.983152841195177e-06]}, "bbox":
[8.983152841195214e-06, 8.983152841195177e-06, 8.983152841195214e-06,
8.983152841195177e-06]}, {"id": "1", "type": "Feature", "properties": {},
"geometry": {"type": "Point", "coordinates": [1.7966305682390428e-05,
1.7966305682390134e-05]}, "bbox": [1.7966305682390428e-05, 1 [...]
- assert result == expected
+ result = s.set_crs("EPSG:3857").to_json(to_wgs84=True)
+ expected = '{"type": "FeatureCollection", "features": [{"id": "0",
"type": "Feature", "properties": {}, "geometry": {"type": "Point",
"coordinates": [8.983152841195214e-06, 8.983152841195177e-06]}, "bbox":
[8.983152841195214e-06, 8.983152841195177e-06, 8.983152841195214e-06,
8.983152841195177e-06]}, {"id": "1", "type": "Feature", "properties": {},
"geometry": {"type": "Point", "coordinates": [1.7966305682390428e-05,
1.7966305682390134e-05]}, "bbox": [1.7966305682390428e-05, 1.796 [...]
+ assert result == expected
def test_to_wkb(self):
if parse_version(shapely.__version__) < parse_version("2.0.0"):
@@ -603,9 +599,7 @@ e": "Feature", "properties": {}, "geometry": {"type":
"Point", "coordinates": [3
import pyarrow as pa
gser = GeoSeries([Point(1, 2), Point(2, 1)])
- # TODO: optimize this away
- with self.ps_allow_diff_frames():
- arrow_array = gser.to_arrow()
+ arrow_array = gser.to_arrow()
result = pa.array(arrow_array)
expected = [
@@ -1614,7 +1608,7 @@ e": "Feature", "properties": {}, "geometry": {"type":
"Point", "coordinates": [3
# Ensure the index is preserved when crs is set (previously an issue)
expected_index = ps.Index(range(1, 6))
- with self.ps_allow_diff_frames():
+ with ps.option_context("compute.ops_on_diff_frames", True):
assert s2.index.equals(expected_index)
result = s.intersection(s2, align=True)
diff --git a/python/tests/geopandas/test_match_geopandas_dataframe.py
b/python/tests/geopandas/test_match_geopandas_dataframe.py
index b411065035..3b0b8eff42 100644
--- a/python/tests/geopandas/test_match_geopandas_dataframe.py
+++ b/python/tests/geopandas/test_match_geopandas_dataframe.py
@@ -132,8 +132,7 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
sgpd_df = GeoDataFrame(self.geometries)
gpd_df = gpd.GeoDataFrame(self.geometries)
- with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.set_geometry("points")
+ sgpd_df = sgpd_df.set_geometry("points")
gpd_df = gpd_df.set_geometry("points")
assert sgpd_df.geometry.name == gpd_df.geometry.name
@@ -146,8 +145,7 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
sgpd_df = GeoDataFrame(self.geometries)
gpd_df = gpd.GeoDataFrame(self.geometries)
- with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.set_geometry("polygons")
+ sgpd_df = sgpd_df.set_geometry("polygons")
gpd_df = gpd_df.set_geometry("polygons")
assert sgpd_df.geometry.name == gpd_df.geometry.name
assert (
@@ -160,20 +158,17 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
sgpd_df = GeoDataFrame(self.geometries)
gpd_df = gpd.GeoDataFrame(self.geometries)
- with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.set_geometry("polygons")
+ sgpd_df = sgpd_df.set_geometry("polygons")
gpd_df = gpd_df.set_geometry("polygons")
assert sgpd_df.geometry.name == gpd_df.geometry.name
# test inplace
- with self.ps_allow_diff_frames():
- sgpd_df.rename_geometry("random", inplace=True)
+ sgpd_df.rename_geometry("random", inplace=True)
gpd_df.rename_geometry("random", inplace=True)
assert sgpd_df.geometry.name == gpd_df.geometry.name
# Ensure the names are different when we rename to different names
- with self.ps_allow_diff_frames():
- sgpd_df = sgpd_df.rename_geometry("name1")
+ sgpd_df = sgpd_df.rename_geometry("name1")
gpd_df = gpd_df.rename_geometry("name2")
assert sgpd_df.geometry.name != gpd_df.geometry.name
@@ -205,9 +200,7 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
]
for data in tests:
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- sgpd_result = GeoDataFrame(data).to_json()
+ sgpd_result = GeoDataFrame(data).to_json()
gpd_result = gpd.GeoDataFrame(data).to_json()
assert sgpd_result == gpd_result
@@ -226,8 +219,8 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
{"na": "drop", "show_bbox": True, "drop_id": True, "to_wgs84":
True},
]
for kwargs in tests:
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
+ # TODO: Try to optimize this 'with ps.option_context("compute.ops_on_diff_frames", True)' away
+ with ps.option_context("compute.ops_on_diff_frames", True):
sgpd_result = GeoDataFrame(data,
crs="EPSG:3857").to_json(**kwargs)
gpd_result = gpd.GeoDataFrame(data,
crs="EPSG:3857").to_json(**kwargs)
assert sgpd_result == gpd_result
@@ -250,9 +243,7 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
}
)
- # TODO: optimize this away
- with self.ps_allow_diff_frames():
- sgpd_result = GeoDataFrame.from_arrow(gdf.to_arrow())
+ sgpd_result = GeoDataFrame.from_arrow(gdf.to_arrow())
gpd_result = gpd.GeoDataFrame.from_arrow(gdf.to_arrow())
self.check_sgpd_df_equals_gpd_df(sgpd_result, gpd_result)
@@ -269,20 +260,16 @@ class TestMatchGeopandasDataFrame(TestGeopandasBase):
"geometry": [Point(1, 2), Point(2, 1), LineString([(0, 0), (1,
1)])],
}
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- sgpd_result = pa.table(GeoDataFrame(data).to_arrow(index=False))
+ sgpd_result = pa.table(GeoDataFrame(data).to_arrow(index=False))
gpd_result = pa.table(gpd.GeoDataFrame(data).to_arrow(index=False))
assert sgpd_result.equals(gpd_result)
- # TODO: Try to optimize this with self.ps_allow_diff_frames() away
- with self.ps_allow_diff_frames():
- sgpd_result = pa.table(
- GeoDataFrame(
- data, index=pd.RangeIndex(start=0, stop=3, step=1)
- ).to_arrow(index=True)
+ sgpd_result = pa.table(
+ GeoDataFrame(data, index=pd.RangeIndex(start=0, stop=3,
step=1)).to_arrow(
+ index=True
)
+ )
gpd_result = pa.table(
gpd.GeoDataFrame(
data, index=pd.RangeIndex(start=0, stop=3, step=1)