(sedona) branch master updated: [GH-2095] Geopandas: Implement Scalable workaround and store as geometries instead of EWKB (#2121)

jiayu Fri, 18 Jul 2025 23:24:14 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new f9ee970853 [GH-2095] Geopandas: Implement Scalable workaround and 
store as geometries instead of EWKB (#2121)
f9ee970853 is described below

commit f9ee970853736bd434475f417d259313fd8b25cf
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Jul 18 23:24:02 2025 -0700

    [GH-2095] Geopandas: Implement Scalable workaround and store as geometries 
instead of EWKB (#2121)
    
    * Implement scalable workaround by using native sedona geometries instead 
of ewkb, and fixing up the tests
    
    * Support empty_list input and use first_valid_index instead of 0 for 
get_srid
    
    * Skip ALL geopandas tests for shapely version 1 (require get_srid method)
---
 python/sedona/geopandas/geodataframe.py            | 57 ++++-----------
 python/sedona/geopandas/geoseries.py               | 85 ++++++----------------
 python/tests/geopandas/test_geodataframe.py        | 28 ++++---
 python/tests/geopandas/test_geoseries.py           | 29 +++++---
 .../geopandas/test_match_geopandas_dataframe.py    |  5 ++
 .../tests/geopandas/test_match_geopandas_series.py | 11 ++-
 python/tests/geopandas/test_sjoin.py               |  6 ++
 7 files changed, 89 insertions(+), 132 deletions(-)

diff --git a/python/sedona/geopandas/geodataframe.py 
b/python/sedona/geopandas/geodataframe.py
index 6e80270698..f4da65207d 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -348,8 +348,17 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
 
             try:
                 result = sgpd.GeoSeries(ps_series)
+                first_idx = ps_series.first_valid_index()
+                if first_idx is not None:
+                    geom = ps_series.iloc[int(first_idx)]
+                    srid = shapely.get_srid(geom)
+
+                    # Shapely objects stored in the ps.Series retain their srid
+                    # but the GeoSeries does not, so we manually re-set it here
+                    if srid > 0:
+                        result.set_crs(srid, inplace=True)
                 return result
-            except:
+            except TypeError:
                 return ps_series
 
         # Handle list of column names
@@ -399,36 +408,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         from sedona.geopandas import GeoSeries
         from pyspark.sql import DataFrame as SparkDataFrame
 
-        # Simplified version of the function from GeoSeries.__init__()
-        def try_geom_to_ewkb(x) -> bytes:
-            if isinstance(x, BaseGeometry):
-                kwargs = {}
-                if crs:
-                    from pyproj import CRS
-
-                    srid = CRS.from_user_input(crs)
-                    kwargs["srid"] = srid.to_epsg()
-
-                return shapely.wkb.dumps(x, **kwargs)
-            elif isinstance(x, bytearray):
-                return bytes(x)
-            elif x is None or isinstance(x, bytes):
-                return x
-            else:
-                raise TypeError(f"expected geometry or bytes, got {type(x)}: 
{x}")
-
-        def safe_apply_to_each_column(
-            df: pd.DataFrame | pspd.DataFrame, func: Callable
-        ):
-            # try except won't work here for ps.DataFrame case, so we use 
first_valid_index() instead
-            for col in df.columns:
-                first_idx = df[col].first_valid_index()
-                if first_idx is not None and isinstance(
-                    df[col][first_idx], BaseGeometry
-                ):
-                    df[col] = df[col].apply(try_geom_to_ewkb)
-            return df
-
         if isinstance(data, GeoDataFrame):
             if data._safe_get_crs() is None:
                 data.crs = crs
@@ -442,8 +421,6 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
             super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
         elif isinstance(data, PandasOnSparkDataFrame):
 
-            data = safe_apply_to_each_column(data, try_geom_to_ewkb)
-
             super().__init__(data, index=index, columns=columns, dtype=dtype, 
copy=copy)
         elif isinstance(data, PandasOnSparkSeries):
 
@@ -475,17 +452,13 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
                     dtype=dtype,
                     copy=copy,
                 )
-            gdf = gpd.GeoDataFrame(df)
-            # convert each geometry column to wkb type
-
-            # It's possible we get a list, dict, pd.Series, gpd.GeoSeries, etc 
of shapely.Geometry objects.
-            gdf = safe_apply_to_each_column(gdf, try_geom_to_ewkb)
 
-            pdf = pd.DataFrame(gdf)
+            # Spark complains if it's left as a geometry type
+            pd_df = df.astype(object)
 
-            # initialize the parent class pyspark Dataframe with the pandas 
Series
+            # initialize the parent class pyspark Dataframe with the pandas 
Dataframe
             super().__init__(
-                data=pdf,
+                data=pd_df,
                 index=index,
                 dtype=dtype,
                 copy=copy,
@@ -949,7 +922,7 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
         0           1.0      1
         1           4.0      2
         """
-        return self._process_geometry_columns("ST_Area", rename_suffix="_area")
+        return self.geometry.area
 
     def _safe_get_crs(self):
         """
diff --git a/python/sedona/geopandas/geoseries.py 
b/python/sedona/geopandas/geoseries.py
index 0075b82c0e..e9fe4d3795 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -28,7 +28,8 @@ from pyspark.pandas.frame import DataFrame as 
PandasOnSparkDataFrame
 from pyspark.pandas.internal import InternalFrame
 from pyspark.pandas.series import first_series
 from pyspark.pandas.utils import scol_for, log_advice
-from pyspark.sql.types import BinaryType
+from pyspark.sql.types import BinaryType, NullType
+from sedona.spark.sql.types import GeometryType
 
 import shapely
 from shapely.geometry.base import BaseGeometry
@@ -378,28 +379,6 @@ class GeoSeries(GeoFrame, pspd.Series):
         self._anchor: GeoDataFrame
         self._col_label: Label
 
-        use_same_anchor = True
-
-        def try_geom_to_ewkb(x) -> bytes:
-            nonlocal use_same_anchor
-            if isinstance(x, BaseGeometry):
-                kwargs = {}
-                if crs:
-                    from pyproj import CRS
-
-                    srid = CRS.from_user_input(crs)
-                    kwargs["srid"] = srid.to_epsg()
-                use_same_anchor = False
-
-                return shapely.wkb.dumps(x, **kwargs)
-            elif isinstance(x, bytearray):
-                use_same_anchor = False
-                return bytes(x)
-            elif x is None or isinstance(x, bytes):
-                return x
-            else:
-                raise TypeError(f"expected geometry or bytes, got {type(x)}: 
{x}")
-
         if isinstance(
             data, (GeoDataFrame, GeoSeries, PandasOnSparkSeries, 
PandasOnSparkDataFrame)
         ):
@@ -418,29 +397,21 @@ class GeoSeries(GeoFrame, pspd.Series):
                     "allow_override=True)' to overwrite CRS or "
                     "'GeoSeries.to_crs(crs)' to reproject geometries. "
                 )
-            # This is a temporary workaround since pyspark errors when 
creating a ps.Series from a ps.Series
-            # This is NOT a scalable solution since we call to_pandas() on the 
data and is a hacky solution
-            # but this should be resolved if/once 
https://github.com/apache/spark/pull/51300 is merged in.
-            # For now, we reset self._anchor = data to have keep the geometry 
information (e.g crs) that's lost in to_pandas()
 
-            pd_data = data.to_pandas()
+            # PySpark Pandas' ps.Series.__init__() does not construction from a
+            # ps.Series input. For now, we manually implement the logic.
 
-            try:
-                pd_data = pd_data.apply(try_geom_to_ewkb)
-            except Exception as e:
-                raise TypeError(f"Non-geometry column passed to GeoSeries: 
{e}")
+            index = data._col_label if index is None else index
+            ps_df = pspd.DataFrame(data._anchor)
 
             super().__init__(
-                data=pd_data,
+                data=ps_df,
                 index=index,
                 dtype=dtype,
                 name=name,
                 copy=copy,
                 fastpath=fastpath,
             )
-
-            if use_same_anchor:
-                self._anchor = data
         else:
             if isinstance(data, pd.Series):
                 assert index is None
@@ -448,9 +419,9 @@ class GeoSeries(GeoFrame, pspd.Series):
                 assert name is None
                 assert not copy
                 assert not fastpath
-                s = data
+                pd_series = data
             else:
-                s = pd.Series(
+                pd_series = pd.Series(
                     data=data,
                     index=index,
                     dtype=dtype,
@@ -459,21 +430,24 @@ class GeoSeries(GeoFrame, pspd.Series):
                     fastpath=fastpath,
                 )
 
-            try:
-                pd_series = s.apply(try_geom_to_ewkb)
-            except Exception as e:
-                raise TypeError(f"Non-geometry column passed to GeoSeries: 
{e}")
-
             # initialize the parent class pyspark Series with the pandas Series
             super().__init__(data=pd_series)
 
-        # manually set it to binary type
+        # Ensure we're storing geometry types
         col = next(
             field.name
             for field in self._internal.spark_frame.schema.fields
             if field.name not in (NATURAL_ORDER_COLUMN_NAME, 
SPARK_DEFAULT_INDEX_NAME)
         )
-        self._internal.spark_frame.schema[col].dataType = BinaryType()
+        datatype = self._internal.spark_frame.schema[col].dataType
+        # Empty lists input will lead to NullType(), so we convert to 
GeometryType()
+        if datatype == NullType():
+            self._internal.spark_frame.schema[col].dataType = GeometryType()
+        elif datatype != GeometryType():
+            raise TypeError(
+                "Non geometry data passed to GeoSeries constructor, "
+                f"received data of dtype '{datatype.typeName()}'"
+            )
 
         if crs:
             self.set_crs(crs, inplace=True)
@@ -522,6 +496,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         tmp = self._process_geometry_column("ST_SRID", rename="crs", 
returns_geom=False)
         ps_series = tmp.take([0])
         srid = ps_series.iloc[0]
+
         # Sedona returns 0 if doesn't exist
         return CRS.from_user_input(srid) if srid != 0 and not pd.isna(srid) 
else None
 
@@ -774,10 +749,6 @@ class GeoSeries(GeoFrame, pspd.Series):
             # must have rename for multiple columns since we don't know which 
name to default to
             assert rename
 
-        # Convert back to EWKB format if the return type is a geometry
-        if returns_geom:
-            query = f"ST_AsEWKB({query})"
-
         query = f"{query} as `{rename}`"
 
         exprs = [query]
@@ -831,20 +802,10 @@ class GeoSeries(GeoFrame, pspd.Series):
         Same as `to_geopandas()`, without issuing the advice log for internal 
usage.
         """
         pd_series = self._to_internal_pandas()
-        try:
-            return gpd.GeoSeries(
-                pd_series.map(
-                    lambda wkb: (
-                        shapely.wkb.loads(bytes(wkb)) if not pd.isna(wkb) else 
None
-                    )
-                ),
-                crs=self.crs,
-            )
-        except TypeError:
-            return gpd.GeoSeries(pd_series, crs=self.crs)
+        return gpd.GeoSeries(pd_series, crs=self.crs)
 
     def to_spark_pandas(self) -> pspd.Series:
-        return pspd.Series(self._psdf._to_internal_pandas())
+        return pspd.Series(pspd.DataFrame(self._psdf._internal))
 
     # 
============================================================================
     # PROPERTIES AND ATTRIBUTES
@@ -3408,7 +3369,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         if isinstance(data, list) and not isinstance(data[0], (tuple, list)):
             data = [(obj,) for obj in data]
 
-        select = f"ST_AsEWKB({select}) as geometry"
+        select = f"{select} as geometry"
 
         if isinstance(data, pspd.Series):
             spark_df = data._internal.spark_frame
diff --git a/python/tests/geopandas/test_geodataframe.py 
b/python/tests/geopandas/test_geodataframe.py
index a6c208f694..c35854e047 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -20,6 +20,7 @@ import tempfile
 from shapely.geometry import (
     Point,
 )
+import shapely
 
 from sedona.geopandas import GeoDataFrame, GeoSeries
 from tests.geopandas.test_geopandas_base import TestGeopandasBase
@@ -32,6 +33,10 @@ from pandas.testing import assert_frame_equal, 
assert_series_equal
 from packaging.version import parse as parse_version
 
 
[email protected](
+    parse_version(shapely.__version__) < parse_version("2.0.0"),
+    reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
+)
 class TestDataframe(TestGeopandasBase):
     @pytest.mark.parametrize(
         "obj",
@@ -42,18 +47,19 @@ class TestDataframe(TestGeopandasBase):
             gpd.GeoDataFrame([Point(x, x) for x in range(3)]),
             pd.Series([Point(x, x) for x in range(3)]),
             gpd.GeoSeries([Point(x, x) for x in range(3)]),
-            GeoSeries([Point(x, x) for x in range(3)]),
-            GeoDataFrame([Point(x, x) for x in range(3)]),
         ],
     )
     def test_constructor(self, obj):
         sgpd_df = GeoDataFrame(obj)
         check_geodataframe(sgpd_df)
 
+    # These need to be separate to make sure Sedona's Geometry UDTs have been 
registered
     def test_constructor_pandas_on_spark(self):
         for obj in [
             ps.DataFrame([Point(x, x) for x in range(3)]),
             ps.Series([Point(x, x) for x in range(3)]),
+            GeoSeries([Point(x, x) for x in range(3)]),
+            GeoDataFrame([Point(x, x) for x in range(3)]),
         ]:
             sgpd_df = GeoDataFrame(obj)
             check_geodataframe(sgpd_df)
@@ -268,23 +274,15 @@ class TestDataframe(TestGeopandasBase):
         data = {"geometry1": [poly1, poly2], "id": [1, 2], "value": ["a", "b"]}
 
         df = GeoDataFrame(data)
+        df.set_geometry("geometry1", inplace=True)
 
-        # Calculate area
-        area_df = df.area
-
-        # Verify result is a GeoDataFrame
-        assert type(area_df) is GeoDataFrame
-
-        # Verify the geometry column was converted to area values
-        assert "geometry1_area" in area_df.columns
+        area_series = df.area
 
-        # Verify non-geometry columns were preserved
-        assert "id" in area_df.columns
-        assert "value" in area_df.columns
+        assert type(area_series) is ps.Series
 
         # Check the actual area values
-        area_values = area_df["geometry1_area"].to_list()
-        assert len(area_values) == 2
+        area_values = area_series.to_list()
+        assert len(area_series) == 2
         self.assert_almost_equal(area_values[0], 1.0)
         self.assert_almost_equal(area_values[1], 4.0)
 
diff --git a/python/tests/geopandas/test_geoseries.py 
b/python/tests/geopandas/test_geoseries.py
index 48d1e11c11..2db3ed317d 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -40,6 +40,10 @@ import pytest
 from packaging.version import parse as parse_version
 
 
[email protected](
+    parse_version(shapely.__version__) < parse_version("2.0.0"),
+    reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
+)
 class TestGeoSeries(TestGeopandasBase):
     def setup_method(self):
         self.geoseries = sgpd.GeoSeries(
@@ -57,6 +61,10 @@ class TestGeoSeries(TestGeopandasBase):
             ]
         )
 
+    def test_empty_list(self):
+        s = sgpd.GeoSeries([])
+        assert s.count() == 0
+
     def test_area(self):
         result = self.geoseries.area.to_pandas()
         expected = pd.Series([0.0, 0.0, 5.23, 5.23])
@@ -222,22 +230,20 @@ class TestGeoSeries(TestGeopandasBase):
         )
         self.check_sgpd_equals_gpd(result, expected)
 
-        data = [None, Point(0, 0), None]
+        data = [Point(0, 0), None]
         # Ensure filling with np.nan or pd.NA returns None
         import numpy as np
 
         for fill_val in [np.nan, pd.NA]:
             result = GeoSeries(data).fillna(fill_val)
-            expected = gpd.GeoSeries([None, Point(0, 0), None])
+            expected = gpd.GeoSeries([Point(0, 0), None])
             self.check_sgpd_equals_gpd(result, expected)
 
         # Ensure filling with None is empty GeometryColleciton and not None
         # Also check that inplace works
         result = GeoSeries(data)
         result.fillna(None, inplace=True)
-        expected = gpd.GeoSeries(
-            [GeometryCollection(), Point(0, 0), GeometryCollection()]
-        )
+        expected = gpd.GeoSeries([Point(0, 0), GeometryCollection()])
         self.check_sgpd_equals_gpd(result, expected)
 
     def test_explode(self):
@@ -424,7 +430,8 @@ class TestGeoSeries(TestGeopandasBase):
                     ]
                 ),
                 GeometryCollection([Point(0, 0), LineString([(0, 0), (1, 
1)])]),
-                LinearRing([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]),
+                # Errors for LinearRing: issue #2120
+                # LinearRing([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]),
             ]
         )
         result = geoseries.geom_type
@@ -437,7 +444,7 @@ class TestGeoSeries(TestGeopandasBase):
                 "Polygon",
                 "MultiPolygon",
                 "GeometryCollection",
-                "LineString",  # Note: Sedona returns LineString instead of 
LinearRing
+                # "LineString",  # Note: Sedona returns LineString instead of 
LinearRing
             ]
         )
         assert_series_equal(result.to_pandas(), expected)
@@ -524,12 +531,14 @@ class TestGeoSeries(TestGeopandasBase):
             [
                 LineString([(0, 0), (1, 1), (1, -1), (0, 1)]),
                 LineString([(0, 0), (1, 1), (1, -1)]),
-                LinearRing([(0, 0), (1, 1), (1, -1), (0, 1)]),
-                LinearRing([(0, 0), (-1, 1), (-1, -1), (1, -1)]),
+                # Errors for LinearRing: issue #2120
+                # LinearRing([(0, 0), (1, 1), (1, -1), (0, 1)]),
+                # LinearRing([(0, 0), (-1, 1), (-1, -1), (1, -1)]),
             ]
         )
         result = s.is_simple
-        expected = pd.Series([False, True, False, True])
+        expected = pd.Series([False, True])
+        # Removed LinearRing cases: False, True]
         assert_series_equal(result.to_pandas(), expected)
 
     def test_is_ring(self):
diff --git a/python/tests/geopandas/test_match_geopandas_dataframe.py 
b/python/tests/geopandas/test_match_geopandas_dataframe.py
index acc7487431..4dd7714c98 100644
--- a/python/tests/geopandas/test_match_geopandas_dataframe.py
+++ b/python/tests/geopandas/test_match_geopandas_dataframe.py
@@ -18,6 +18,7 @@
 import pytest
 import shutil
 import tempfile
+import shapely
 from shapely.geometry import (
     Point,
     Polygon,
@@ -36,6 +37,10 @@ from tests.geopandas.test_geopandas_base import 
TestGeopandasBase
 import pyspark.pandas as ps
 
 
[email protected](
+    parse_version(shapely.__version__) < parse_version("2.0.0"),
+    reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
+)
 class TestMatchGeopandasDataFrame(TestGeopandasBase):
     def setup_method(self):
         self.tempdir = tempfile.mkdtemp()
diff --git a/python/tests/geopandas/test_match_geopandas_series.py 
b/python/tests/geopandas/test_match_geopandas_series.py
index e71ec4ce9a..cfc1b587ba 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -41,6 +41,10 @@ import pyspark.pandas as ps
 from packaging.version import parse as parse_version
 
 
[email protected](
+    parse_version(shapely.__version__) < parse_version("2.0.0"),
+    reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
+)
 class TestMatchGeopandasSeries(TestGeopandasBase):
     def setup_method(self):
         self.tempdir = tempfile.mkdtemp()
@@ -323,7 +327,7 @@ class TestMatchGeopandasSeries(TestGeopandasBase):
             gpd_result = gpd.GeoSeries(geom).fillna()
             self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
 
-        data = [None, None, None, None, Point(0, 1)]
+        data = [Point(1, 1), None, None, None, Point(0, 1)]
         sgpd_result = GeoSeries(data).fillna()
         gpd_result = gpd.GeoSeries(data).fillna()
         self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
@@ -479,8 +483,9 @@ class TestMatchGeopandasSeries(TestGeopandasBase):
             LineString([(0, 0), (0, 0)]),
             LineString([(0, 0), (1, 1), (1, -1), (0, 1)]),
             LineString([(0, 0), (1, 1), (0, 0)]),
-            LinearRing([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]),
-            LinearRing([(0, 0), (-1, 1), (-1, -1), (1, -1)]),
+            # Errors for LinearRing: issue #2120
+            # LinearRing([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]),
+            # LinearRing([(0, 0), (-1, 1), (-1, -1), (1, -1)]),
         ]
         sgpd_result = GeoSeries(data).is_simple
         gpd_result = gpd.GeoSeries(data).is_simple
diff --git a/python/tests/geopandas/test_sjoin.py 
b/python/tests/geopandas/test_sjoin.py
index 7448193507..9dda951f18 100644
--- a/python/tests/geopandas/test_sjoin.py
+++ b/python/tests/geopandas/test_sjoin.py
@@ -17,12 +17,18 @@
 import shutil
 import tempfile
 import pytest
+import shapely
 
 from shapely.geometry import Polygon, Point, LineString
 from sedona.geopandas import GeoSeries, GeoDataFrame, sjoin
 from tests.test_base import TestBase
+from packaging.version import parse as parse_version
 
 
[email protected](
+    parse_version(shapely.__version__) < parse_version("2.0.0"),
+    reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
+)
 class TestSpatialJoin(TestBase):
     def setup_method(self):
         self.tempdir = tempfile.mkdtemp()

(sedona) branch master updated: [GH-2095] Geopandas: Implement Scalable workaround and store as geometries instead of EWKB (#2121)

Reply via email to