This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 70967cb963 [GH-2007] Geopandas.Dataframe: Fix constructor for
pandas-on-pyspark and Sedona Geopandas input types (#2008)
70967cb963 is described below
commit 70967cb963e2f4302fdbd1db8d65e3255e3dce14
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jun 26 12:12:04 2025 -0700
[GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and
Sedona Geopandas input types (#2008)
* Fix small constructor bug
* Fix condition for converting to wkb
* Fix constructor to not error on sgpd and pspd inputs
* Add constructor tests for all input types, including non-geometry
* pre-commit reformat
* Change to BaseGeometry for shapely compatibility
* pre-commit fmt
* Remove empty list and dict test cases, since different Spark versions
handle them differently
---
python/sedona/geopandas/geodataframe.py | 14 +++---
python/tests/geopandas/test_geodataframe.py | 77 +++++++++++++++++++++++++----
2 files changed, 76 insertions(+), 15 deletions(-)
diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
index a3b1db624c..a2f90dff4b 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -141,15 +141,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
if isinstance(data, (GeoDataFrame, GeoSeries)):
assert dtype is None
assert not copy
- self._anchor = data
- self._col_label = index
+ super().__init__(data, index=index, dtype=dtype, copy=copy)
elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)):
assert columns is None
assert dtype is None
assert not copy
- if index is None:
- internal =
InternalFrame(spark_frame=data._internal.spark_frame)
- object.__setattr__(self, "_internal_frame", internal)
+ super().__init__(data, index=index, dtype=dtype)
elif isinstance(data, SparkDataFrame):
assert columns is None
assert dtype is None
@@ -173,8 +170,13 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
)
gdf = gpd.GeoDataFrame(df)
# convert each geometry column to wkb type
+ import shapely
+
for col in gdf.columns:
- if isinstance(gdf[col], gpd.GeoSeries):
+ # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects.
+ if len(gdf[col]) > 0 and isinstance(
+ gdf[col].iloc[0], shapely.geometry.base.BaseGeometry
+ ):
gdf[col] = gdf[col].apply(lambda geom: geom.wkb)
pdf = pd.DataFrame(gdf)
# initialize the parent class pyspark Dataframe with the pandas Series
diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py
index b236581255..33e0041dc2 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -21,9 +21,14 @@ from shapely.geometry import (
Point,
)
-from sedona.geopandas import GeoDataFrame
+from sedona.geopandas import GeoDataFrame, GeoSeries
from tests.test_base import TestBase
import pyspark.pandas as ps
+import pandas as pd
+import geopandas as gpd
+import sedona.geopandas as sgpd
+import pytest
+from pandas.testing import assert_frame_equal
class TestDataframe(TestBase):
@@ -41,10 +46,52 @@ class TestDataframe(TestBase):
#
# def teardown_method(self):
# shutil.rmtree(self.tempdir)
-
- def test_constructor(self):
- df = GeoDataFrame([Point(x, x) for x in range(3)])
- check_geodataframe(df)
+ @pytest.mark.parametrize(
+ "obj",
+ [
+ [Point(x, x) for x in range(3)],
+ {"geometry": [Point(x, x) for x in range(3)]},
+ pd.DataFrame([Point(x, x) for x in range(3)]),
+ gpd.GeoDataFrame([Point(x, x) for x in range(3)]),
+ pd.Series([Point(x, x) for x in range(3)]),
+ gpd.GeoSeries([Point(x, x) for x in range(3)]),
+ GeoSeries([Point(x, x) for x in range(3)]),
+ GeoDataFrame([Point(x, x) for x in range(3)]),
+ ],
+ )
+ def test_constructor(self, obj):
+ sgpd_df = GeoDataFrame(obj)
+ check_geodataframe(sgpd_df)
+
+ def test_constructor_pandas_on_spark(self):
+ for obj in [
+ ps.DataFrame([Point(x, x) for x in range(3)]),
+ ps.Series([Point(x, x) for x in range(3)]),
+ ]:
+ sgpd_df = GeoDataFrame(obj)
+ check_geodataframe(sgpd_df)
+
+ @pytest.mark.parametrize(
+ "obj",
+ [
+ [0, 1, 2],
+ ["x", "y", "z"],
+ {"a": [0, 1, 2], 1: [4, 5, 6]},
+ {"a": ["x", "y", "z"], 1: ["a", "b", "c"]},
+ pd.Series([0, 1, 2]),
+ pd.Series(["x", "y", "z"]),
+ pd.DataFrame({"x": ["x", "y", "z"]}),
+ gpd.GeoDataFrame({"x": [0, 1, 2]}),
+ ps.DataFrame({"x": ["x", "y", "z"]}),
+ ],
+ )
+ def test_non_geometry(self, obj):
+ pd_df = pd.DataFrame(obj)
+ # pd.DataFrame(obj) doesn't work correctly for pandas on spark DataFrame type, so we use to_pandas() method instead.
+ if isinstance(obj, ps.DataFrame):
+ pd_df = obj.to_pandas()
+ sgpd_df = sgpd.GeoDataFrame(obj)
+ assert_frame_equal(pd_df, sgpd_df.to_pandas())
def test_psdf(self):
# this is to make sure the spark session works with pandas on spark api
@@ -73,7 +120,10 @@ class TestDataframe(TestBase):
# Assert the geometry column has the correct type and is not nullable
geometry_field = schema["geometry1"]
- assert geometry_field.dataType.typeName() == "geometrytype"
+ assert (
+ geometry_field.dataType.typeName() == "geometrytype"
+ or geometry_field.dataType.typeName() == "binary"
+ )
assert not geometry_field.nullable
# Assert non-geometry columns are present with correct types
@@ -97,16 +147,25 @@ class TestDataframe(TestBase):
schema = df._internal.spark_frame.schema
# Assert both geometry columns have the correct type
geometry_field1 = schema["geometry1"]
- assert geometry_field1.dataType.typeName() == "geometrytype"
+ assert (
+ geometry_field1.dataType.typeName() == "geometrytype"
+ or geometry_field1.dataType.typeName() == "binary"
+ )
assert not geometry_field1.nullable
geometry_field2 = schema["geometry2"]
- assert geometry_field2.dataType.typeName() == "geometrytype"
+ assert (
+ geometry_field2.dataType.typeName() == "geometrytype"
+ or geometry_field2.dataType.typeName() == "binary"
+ )
assert not geometry_field2.nullable
# Check non-geometry column
attribute_field = schema["attribute"]
- assert attribute_field.dataType.typeName() != "geometrytype"
+ assert (
+ attribute_field.dataType.typeName() != "geometrytype"
+ and attribute_field.dataType.typeName() != "binary"
+ )
def test_copy(self):
df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df")