(sedona) branch master updated: [GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and Sedona Geopandas input types (#2008)

jiayu Thu, 26 Jun 2025 12:12:48 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new 70967cb963 [GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and Sedona Geopandas input types (#2008)
70967cb963 is described below

commit 70967cb963e2f4302fdbd1db8d65e3255e3dce14
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jun 26 12:12:04 2025 -0700

    [GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and Sedona Geopandas input types (#2008)
    
    * Fix small constructor bug
    
    * Fix condition for converting to wkb
    
    * Fix constructor to not error on sgpd and pspd inputs
    
    * Add constructor tests for all input types, including non-geometry
    
    * pre-commit reformat
    
    * Change to BaseGeometry for shapely compatibility
    
    * pre-commit fmt
    
    * Remove empty lst and dct test cases since diff spark versions handle differently
---
 python/sedona/geopandas/geodataframe.py     | 14 +++---
 python/tests/geopandas/test_geodataframe.py | 77 +++++++++++++++++++++++++----
 2 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
index a3b1db624c..a2f90dff4b 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -141,15 +141,12 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         if isinstance(data, (GeoDataFrame, GeoSeries)):
             assert dtype is None
             assert not copy
-            self._anchor = data
-            self._col_label = index
+            super().__init__(data, index=index, dtype=dtype, copy=copy)
         elif isinstance(data, (PandasOnSparkSeries, PandasOnSparkDataFrame)):
             assert columns is None
             assert dtype is None
             assert not copy
-            if index is None:
-                internal = InternalFrame(spark_frame=data._internal.spark_frame)
-                object.__setattr__(self, "_internal_frame", internal)
+            super().__init__(data, index=index, dtype=dtype)
         elif isinstance(data, SparkDataFrame):
             assert columns is None
             assert dtype is None
@@ -173,8 +170,13 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                 )
             gdf = gpd.GeoDataFrame(df)
             # convert each geometry column to wkb type
+            import shapely
+
             for col in gdf.columns:
-                if isinstance(gdf[col], gpd.GeoSeries):
+                # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc of shapely.Geometry objects.
+                if len(gdf[col]) > 0 and isinstance(
+                    gdf[col].iloc[0], shapely.geometry.base.BaseGeometry
+                ):
                     gdf[col] = gdf[col].apply(lambda geom: geom.wkb)
             pdf = pd.DataFrame(gdf)
             # initialize the parent class pyspark Dataframe with the pandas Series
diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py
index b236581255..33e0041dc2 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -21,9 +21,14 @@ from shapely.geometry import (
     Point,
 )
 
-from sedona.geopandas import GeoDataFrame
+from sedona.geopandas import GeoDataFrame, GeoSeries
 from tests.test_base import TestBase
 import pyspark.pandas as ps
+import pandas as pd
+import geopandas as gpd
+import sedona.geopandas as sgpd
+import pytest
+from pandas.testing import assert_frame_equal
 
 
 class TestDataframe(TestBase):
@@ -41,10 +46,52 @@ class TestDataframe(TestBase):
     #
     # def teardown_method(self):
     #     shutil.rmtree(self.tempdir)
-
-    def test_constructor(self):
-        df = GeoDataFrame([Point(x, x) for x in range(3)])
-        check_geodataframe(df)
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            [Point(x, x) for x in range(3)],
+            {"geometry": [Point(x, x) for x in range(3)]},
+            pd.DataFrame([Point(x, x) for x in range(3)]),
+            gpd.GeoDataFrame([Point(x, x) for x in range(3)]),
+            pd.Series([Point(x, x) for x in range(3)]),
+            gpd.GeoSeries([Point(x, x) for x in range(3)]),
+            GeoSeries([Point(x, x) for x in range(3)]),
+            GeoDataFrame([Point(x, x) for x in range(3)]),
+        ],
+    )
+    def test_constructor(self, obj):
+        sgpd_df = GeoDataFrame(obj)
+        check_geodataframe(sgpd_df)
+
+    def test_constructor_pandas_on_spark(self):
+        for obj in [
+            ps.DataFrame([Point(x, x) for x in range(3)]),
+            ps.Series([Point(x, x) for x in range(3)]),
+        ]:
+            sgpd_df = GeoDataFrame(obj)
+            check_geodataframe(sgpd_df)
+
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            [0, 1, 2],
+            ["x", "y", "z"],
+            {"a": [0, 1, 2], 1: [4, 5, 6]},
+            {"a": ["x", "y", "z"], 1: ["a", "b", "c"]},
+            pd.Series([0, 1, 2]),
+            pd.Series(["x", "y", "z"]),
+            pd.DataFrame({"x": ["x", "y", "z"]}),
+            gpd.GeoDataFrame({"x": [0, 1, 2]}),
+            ps.DataFrame({"x": ["x", "y", "z"]}),
+        ],
+    )
+    def test_non_geometry(self, obj):
+        pd_df = pd.DataFrame(obj)
+        # pd.DataFrame(obj) doesn't work correctly for pandas on spark DataFrame type, so we use to_pandas() method instead.
+        if isinstance(obj, ps.DataFrame):
+            pd_df = obj.to_pandas()
+        sgpd_df = sgpd.GeoDataFrame(obj)
+        assert_frame_equal(pd_df, sgpd_df.to_pandas())
 
     def test_psdf(self):
         # this is to make sure the spark session works with pandas on spark api
@@ -73,7 +120,10 @@ class TestDataframe(TestBase):
 
         # Assert the geometry column has the correct type and is not nullable
         geometry_field = schema["geometry1"]
-        assert geometry_field.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field.dataType.typeName() == "geometrytype"
+            or geometry_field.dataType.typeName() == "binary"
+        )
         assert not geometry_field.nullable
 
         # Assert non-geometry columns are present with correct types
@@ -97,16 +147,25 @@ class TestDataframe(TestBase):
         schema = df._internal.spark_frame.schema
         # Assert both geometry columns have the correct type
         geometry_field1 = schema["geometry1"]
-        assert geometry_field1.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field1.dataType.typeName() == "geometrytype"
+            or geometry_field1.dataType.typeName() == "binary"
+        )
         assert not geometry_field1.nullable
 
         geometry_field2 = schema["geometry2"]
-        assert geometry_field2.dataType.typeName() == "geometrytype"
+        assert (
+            geometry_field2.dataType.typeName() == "geometrytype"
+            or geometry_field2.dataType.typeName() == "binary"
+        )
         assert not geometry_field2.nullable
 
         # Check non-geometry column
         attribute_field = schema["attribute"]
-        assert attribute_field.dataType.typeName() != "geometrytype"
+        assert (
+            attribute_field.dataType.typeName() != "geometrytype"
+            and attribute_field.dataType.typeName() != "binary"
+        )
 
     def test_copy(self):
         df = GeoDataFrame([Point(x, x) for x in range(3)], name="test_df")

(sedona) branch master updated: [GH-2007] Geopandas.Dataframe: Fix constructor for pandas-on-pyspark and Sedona Geopandas input types (#2008)

Reply via email to