This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new d799f5082c [GH-2004] Geopandas.GeoSeries: Implement Test Framework (#2005)
d799f5082c is described below

commit d799f5082ca651891d656183d7851378baaf5dae
Author: Peter Nguyen <[email protected]>
AuthorDate: Wed Jun 25 16:54:02 2025 -0700

    [GH-2004] Geopandas.GeoSeries: Implement Test Framework (#2005)
    
    * Fix small nit in series __repr__()
    
    * Add test_non_geom_fails()
    
    * test_constructor on all different geometry types
    
    * Change Series.area return type to pd.Series to match gpd behavior and add area tests
    
    * Fix GeoSeries.to_pandas() and fix refactor tests
    
    * pre-commit
    
    * Test if sgpd_res equals sedona result and gpd result
    
    * Remove run_sedona_sql test
    
    * Rename test_geoseries.py to test_match_geopandas_series.py
    
    * Make area() return ps.Series instead of pd.Series
    
    * Add new test_geoseries to mimic the scala tests
    
    * Use smaller tests for test_geoseries and hard-code expected results
    
    * Remove check_less_precise for version compatibility
---
 python/sedona/geopandas/geoseries.py               |  25 ++-
 python/tests/geopandas/test_geoseries.py           | 136 ++++--------
 .../tests/geopandas/test_match_geopandas_series.py | 235 +++++++++++++++++++++
 3 files changed, 292 insertions(+), 104 deletions(-)
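
The heart of the new framework is a comparison pattern: build the same geometries as both a Sedona-backed GeoSeries and a plain geopandas GeoSeries, apply the operation to each, and check that the results agree within a tolerance. A minimal sketch of that pattern, assuming a Sedona-enabled Spark session is already active (the tests themselves use an assert_geometry_almost_equal helper from their TestBase; shapely's equals_exact stands in for it here):

    import geopandas as gpd
    import sedona.geopandas as sgpd
    from shapely.geometry import Point

    geoms = [Point(x, x + 1) for x in range(3)]
    sgpd_result = sgpd.GeoSeries(geoms).buffer(0.2)  # distributed, Sedona-backed
    gpd_result = gpd.GeoSeries(geoms).buffer(0.2)    # local geopandas reference
    for actual, expected in zip(sgpd_result.to_geopandas(), gpd_result):
        # loose 1e-2 tolerance, mirroring check_sgpd_equals_gpd in the diff below
        assert actual.equals_exact(expected, tolerance=1e-2)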

diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index e17e9817a9..b9b914aa45 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -50,8 +50,8 @@ class GeoSeries(GeoFrame, pspd.Series):
         Return a string representation of the GeoSeries in WKT format.
         """
         try:
-            pandas_series = self.to_geopandas()
-            return gpd.GeoSeries(pandas_series).__repr__()
+            gpd_series = self.to_geopandas()
+            return gpd_series.__repr__()
 
         except Exception as e:
             # Fallback to parent's representation if conversion fails
@@ -176,7 +176,7 @@ class GeoSeries(GeoFrame, pspd.Series):
             A GeoSeries with the operation applied to the geometry column.
         """
         # Find the first column with BinaryType or GeometryType
-        first_col = self.get_first_geometry_column()
+        first_col = self.get_first_geometry_column()  # TODO: fixme
 
         if first_col:
             data_type = self._internal.spark_frame.schema[first_col].dataType
@@ -230,9 +230,16 @@ class GeoSeries(GeoFrame, pspd.Series):
         return self._to_geopandas()
 
     def _to_geopandas(self) -> gpd.GeoSeries:
-        return gpd.GeoSeries(
-            self._to_internal_pandas().map(lambda wkb: shapely.wkb.loads(bytes(wkb)))
-        )
+        pd_series = self._to_internal_pandas()
+        try:
+            return gpd.GeoSeries(
+                pd_series.map(lambda wkb: shapely.wkb.loads(bytes(wkb)))
+            )
+        except Exception as e:
+            return gpd.GeoSeries(pd_series)
+
+    def to_spark_pandas(self) -> pspd.Series:
+        return pspd.Series(self._to_internal_pandas())
 
     @property
     def geometry(self) -> "GeoSeries":
@@ -274,7 +281,7 @@ class GeoSeries(GeoFrame, pspd.Series):
             return self
 
     @property
-    def area(self) -> "GeoSeries":
+    def area(self) -> pspd.Series:
         """
         Returns a Series containing the area of each geometry in the GeoSeries expressed in the units of the CRS.
 
@@ -295,7 +302,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         1    4.0
         dtype: float64
         """
-        return self._process_geometry_column("ST_Area", rename="area")
+        return self._process_geometry_column("ST_Area", rename="area").to_spark_pandas()
 
     @property
     def crs(self):
@@ -521,7 +528,7 @@ class GeoSeries(GeoFrame, pspd.Series):
         mitre_limit=5.0,
         single_sided=False,
         **kwargs,
-    ):
+    ) -> "GeoSeries":
         """
         Returns a GeoSeries of geometries representing all points within a given distance of each geometric object.
 
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 3f560262ea..bec526ad46 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -14,109 +14,55 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import os
-import shutil
-import tempfile
-from geopandas.testing import assert_geoseries_equal
 
-from shapely.geometry import (
-    Point,
-    Polygon,
-)
-
-from sedona.geopandas import GeoSeries
+import pandas as pd
+import geopandas as gpd
+import sedona.geopandas as sgpd
 from tests.test_base import TestBase
-import pyspark.pandas as ps
+from shapely import wkt
+from shapely.geometry import Point, LineString, Polygon, GeometryCollection
+from pandas.testing import assert_series_equal
 
 
-class TestSeries(TestBase):
+class TestGeoSeries(TestBase):
     def setup_method(self):
-        self.tempdir = tempfile.mkdtemp()
-        self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
-        self.t2 = Polygon([(0, 0), (1, 1), (0, 1)])
-        self.sq = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
-        self.g1 = GeoSeries([self.t1, self.t2])
-        self.g2 = GeoSeries([self.sq, self.t1])
-        self.g3 = GeoSeries([self.t1, self.t2], crs="epsg:4326")
-        self.g4 = GeoSeries([self.t2, self.t1])
-
-    def teardown_method(self):
-        shutil.rmtree(self.tempdir)
-
-    def test_constructor(self):
-        s = GeoSeries([Point(x, x) for x in range(3)])
-        check_geoseries_equal(s, s)
-
-    def test_psdf(self):
-        # this is to make sure the spark session works with pandas on spark api
-        psdf = ps.DataFrame(
-            {
-                "a": [1, 2, 3, 4, 5, 6],
-                "b": [100, 200, 300, 400, 500, 600],
-                "c": ["one", "two", "three", "four", "five", "six"],
-            },
-            index=[10, 20, 30, 40, 50, 60],
+        self.geoseries = sgpd.GeoSeries(
+            [
+                Point(2.3, -1),
+                LineString([(0.5, 0), (0, -3)]),
+                Polygon([(-1, -1), (-0.3, 5), (1, 1.2)]),
+                GeometryCollection(
+                    [
+                        Point(2.3, -1),
+                        LineString([(0.5, 0), (0, -3)]),
+                        Polygon([(-1, -1), (-0.3, 5), (1, 1.2)]),
+                    ]
+                ),
+            ]
         )
-        assert psdf.count().count() == 3
-
-    def test_internal_st_function(self):
-        # this is to make sure the spark session works with internal sedona udfs
-        baseDf = self.spark.sql(
-            "SELECT ST_GeomFromWKT('POLYGON ((50 50 1, 50 80 2, 80 80 3, 80 50 
2, 50 50 1))') as geom"
-        )
-        actual = baseDf.selectExpr("ST_AsText(ST_Expand(geom, 10))").first()[0]
-        expected = "POLYGON Z((40 40 -9, 40 90 -9, 90 90 13, 90 40 13, 40 40 
-9))"
-        assert expected == actual
-
-    def test_type(self):
-        assert type(self.g1) is GeoSeries
-        assert type(self.g2) is GeoSeries
-        assert type(self.g3) is GeoSeries
-        assert type(self.g4) is GeoSeries
 
-    def test_copy(self):
-        gc = self.g3.copy()
-        assert type(gc) is GeoSeries
-        assert self.g3.name == gc.name
+    def check_sgpd_equals_gpd(self, actual: sgpd.GeoSeries, expected: gpd.GeoSeries):
+        assert isinstance(actual, sgpd.GeoSeries)
+        assert isinstance(expected, gpd.GeoSeries)
+        assert len(actual) == len(expected)
+        sgpd_result = actual.to_geopandas()
+        for a, e in zip(sgpd_result, expected):
+            self.assert_geometry_almost_equal(a, e)
 
     def test_area(self):
-        area = self.g1.area
-        assert area is not None
-        assert type(area) is GeoSeries
-        assert area.count() == 2
+        result = self.geoseries.area.to_pandas()
+        expected = pd.Series([0.0, 0.0, 5.23, 5.23])
+        assert result.count() > 0
+        assert_series_equal(result, expected)
 
     def test_buffer(self):
-        buffer = self.g1.buffer(0.2)
-        assert buffer is not None
-        assert type(buffer) is GeoSeries
-        assert buffer.count() == 2
-
-    def test_buffer_then_area(self):
-        area = self.g1.buffer(0.2).area
-        assert area is not None
-        assert type(area) is GeoSeries
-        assert area.count() == 2
-
-    def test_buffer_then_geoparquet(self):
-        temp_file_path = os.path.join(
-            self.tempdir, next(tempfile._get_candidate_names()) + ".parquet"
-        )
-        self.g1.buffer(0.2).to_parquet(temp_file_path)
-        assert os.path.exists(temp_file_path)
-
-
-# -----------------------------------------------------------------------------
-# # Utils
-# -----------------------------------------------------------------------------
-
-
-def check_geoseries_equal(s1, s2):
-    assert isinstance(s1, GeoSeries)
-    assert isinstance(s1.geometry, GeoSeries)
-    assert isinstance(s2, GeoSeries)
-    assert isinstance(s2.geometry, GeoSeries)
-    if isinstance(s1, GeoSeries):
-        s1 = s1.to_geopandas()
-    if isinstance(s2, GeoSeries):
-        s2 = s2.to_geopandas()
-    assert_geoseries_equal(s1, s2)
+        result = self.geoseries.buffer(1)
+        expected = [
+            "POLYGON ((3.300000000000000 -1.000000000000000, 3.280785280403230 
-1.195090322016128, 3.223879532511287 -1.382683432365090, 3.131469612302545 
-1.555570233019602, 3.007106781186547 -1.707106781186547, 2.855570233019602 
-1.831469612302545, 2.682683432365089 -1.923879532511287, 2.495090322016128 
-1.980785280403230, 2.300000000000000 -2.000000000000000, 2.104909677983872 
-1.980785280403230, 1.917316567634910 -1.923879532511287, 1.744429766980398 
-1.831469612302545, 1.59289321881 [...]
+            "POLYGON ((0.986393923832144 -3.164398987305357, 0.935367989801224 
-3.353676015097457, 0.848396388482656 -3.529361471973156, 0.728821389740875 
-3.684703864350261, 0.581238193719096 -3.813733471206735, 0.411318339874827 
-3.911491757111723, 0.225591752899151 -3.974221925961374, 0.031195801372873 
-3.999513292546280, -0.164398987305357 -3.986393923832144, -0.353676015097457 
-3.935367989801224, -0.529361471973156 -3.848396388482656, -0.684703864350260 
-3.728821389740875, -0.813733 [...]
+            "POLYGON ((-0.260059926604056 -1.672672793996312, 
-0.403493516968407 -1.802608257932399, -0.569270104475049 -1.902480890158382, 
-0.751180291696993 -1.968549819451744, -0.942410374326119 -1.998340340272165, 
-1.135797558140999 -1.990736606370705, -1.324098251632999 -1.946023426395157, 
-1.500259385009482 -1.865875595977814, -1.657682592935656 -1.753295165887471, 
-1.790471365675451 -1.612498995956065, -1.893651911234561 -1.448760806607280, 
-1.963359455800552 -1.268213644171327, - [...]
+            "POLYGON ((-0.844303230213814 -1.983056850984667, 
-0.942410374326119 -1.998340340272165, -1.135797558140999 -1.990736606370705, 
-1.324098251632999 -1.946023426395157, -1.500259385009482 -1.865875595977814, 
-1.657682592935656 -1.753295165887471, -1.790471365675451 -1.612498995956065, 
-1.893651911234561 -1.448760806607280, -1.963359455800552 -1.268213644171327, 
-1.996983004332570 -1.077620158927971, -1.993263139087243 -0.884119300439822, 
-1.293263139087243 5.115880699560178, -1 [...]
+        ]
+        expected = gpd.GeoSeries([wkt.loads(wkt_str) for wkt_str in expected])
+        assert result.count() > 0
+        self.check_sgpd_equals_gpd(result, expected)
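
One user-visible effect of the geoseries.py changes earlier in this diff is that GeoSeries.area now evaluates through ST_Area and comes back as a pyspark.pandas Series rather than another GeoSeries. A hedged usage sketch, assuming an active Sedona-enabled Spark session (the geometries here are chosen purely for illustration):

    import pyspark.pandas as ps
    from shapely.geometry import Polygon
    from sedona.geopandas import GeoSeries

    s = GeoSeries([
        Polygon([(0, 0), (1, 0), (1, 1)]),          # triangle, area 0.5
        Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),  # unit square, area 1.0
    ])
    area = s.area
    assert isinstance(area, ps.Series)  # no longer a GeoSeries
    print(area.to_pandas())             # 0    0.5
                                        # 1    1.0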
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
new file mode 100644
index 0000000000..3b3eccc4d4
--- /dev/null
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import os
+import shutil
+import tempfile
+import pytest
+import pandas as pd
+import geopandas as gpd
+import pyspark.pandas as ps
+import pyspark
+from pandas.testing import assert_series_equal
+
+from shapely.geometry import (
+    Point,
+    Polygon,
+    MultiPoint,
+    MultiLineString,
+    LineString,
+    MultiPolygon,
+    GeometryCollection,
+)
+
+from sedona.geopandas import GeoSeries
+from tests.test_base import TestBase
+import pyspark.pandas as ps
+
+
+class TestMatchGeopandasSeries(TestBase):
+    def setup_method(self):
+        self.tempdir = tempfile.mkdtemp()
+        self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
+        self.t2 = Polygon([(0, 0), (1, 1), (0, 1)])
+        self.sq = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
+        self.g1 = GeoSeries([self.t1, self.t2])
+        self.g2 = GeoSeries([self.sq, self.t1])
+        self.g3 = GeoSeries([self.t1, self.t2], crs="epsg:4326")
+        self.g4 = GeoSeries([self.t2, self.t1])
+
+        self.points = [Point(x, x + 1) for x in range(3)]
+
+        self.multipoints = [MultiPoint([(x, x + 1), (x + 2, x + 3)]) for x in range(3)]
+
+        self.linestrings = [LineString([(x, x + 1), (x + 2, x + 3)]) for x in range(3)]
+
+        self.multilinestrings = [
+            MultiLineString(
+                [[[x, x + 1], [x + 2, x + 3]], [[x + 4, x + 5], [x + 6, x + 7]]]
+            )
+            for x in range(3)
+        ]
+
+        self.polygons = [
+            Polygon([(x, 0), (x + 1, 0), (x + 2, 1), (x + 3, 1)]) for x in range(3)
+        ]
+
+        self.multipolygons = [
+            MultiPolygon(
+                [
+                    (
+                        [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+                        [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+                    )
+                ]
+            )
+        ]
+
+        self.geomcollection = [
+            GeometryCollection(
+                [
+                    MultiPoint([(0, 0), (1, 1)]),
+                    MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
+                    MultiPolygon(
+                        [
+                            (
+                                [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0)],
+                                [[(0.1, 0.1), (0.1, 0.2), (0.2, 0.1), (0.1, 0.1)]],
+                            )
+                        ]
+                    ),
+                ]
+            )
+        ]
+
+        # (sql_table_name, geom)
+        self.geoms = [
+            ("points", self.points),
+            ("multipoints", self.multipoints),
+            ("linestrings", self.linestrings),
+            ("multilinestrings", self.multilinestrings),
+            ("polygons", self.polygons),
+            ("multipolygons", self.multipolygons),
+            ("geomcollection", self.geomcollection),
+        ]
+
+        # create the tables in sedona spark
+        for i, (table_name, geoms) in enumerate(self.geoms):
+            wkt_string = [g.wkt for g in geoms]
+            pd_df = pd.DataFrame({"id": i, "geometry": wkt_string})
+            spark_df = self.spark.createDataFrame(pd_df)
+            spark_df.createOrReplaceTempView(table_name)
+
+    def teardown_method(self):
+        shutil.rmtree(self.tempdir)
+
+    def test_constructor(self):
+        for _, geom in self.geoms:
+            gpd_series = gpd.GeoSeries(geom)
+            assert isinstance(gpd_series, gpd.GeoSeries)
+            assert isinstance(gpd_series.geometry, gpd.GeoSeries)
+
+    def test_non_geom_fails(self):
+        with pytest.raises(TypeError):
+            GeoSeries([0, 1, 2])
+        with pytest.raises(TypeError):
+            GeoSeries([0, 1, 2], crs="epsg:4326")
+        with pytest.raises(TypeError):
+            GeoSeries(["a", "b", "c"])
+
+    def test_to_geopandas(self):
+        for _, geom in self.geoms:
+            sgpd_result = GeoSeries(geom)
+            gpd_result = gpd.GeoSeries(geom)
+            self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+    def test_psdf(self):
+        # this is to make sure the spark session works with pandas on spark api
+        psdf = ps.DataFrame(
+            {
+                "a": [1, 2, 3, 4, 5, 6],
+                "b": [100, 200, 300, 400, 500, 600],
+                "c": ["one", "two", "three", "four", "five", "six"],
+            },
+            index=[10, 20, 30, 40, 50, 60],
+        )
+        assert psdf.count().count() == 3
+
+    def test_internal_st_function(self):
+        # this is to make sure the spark session works with internal sedona udfs
+        baseDf = self.spark.sql(
+            "SELECT ST_GeomFromWKT('POLYGON ((50 50 1, 50 80 2, 80 80 3, 80 50 
2, 50 50 1))') as geom"
+        )
+        actual = baseDf.selectExpr("ST_AsText(ST_Expand(geom, 10))").first()[0]
+        expected = "POLYGON Z((40 40 -9, 40 90 -9, 90 90 13, 90 40 13, 40 40 
-9))"
+        assert expected == actual
+
+    def test_type(self):
+        assert type(self.g1) is GeoSeries
+        assert type(self.g2) is GeoSeries
+        assert type(self.g3) is GeoSeries
+        assert type(self.g4) is GeoSeries
+
+    def test_copy(self):
+        gc = self.g3.copy()
+        assert type(gc) is GeoSeries
+        assert self.g3.name == gc.name
+
+    def test_area(self):
+        area = self.g1.area
+        assert area is not None
+        assert type(area) is ps.Series
+        assert area.count() == 2
+
+        for _, geom in self.geoms:
+            sgpd_result = GeoSeries(geom).area
+            gpd_result = gpd.GeoSeries(geom).area
+            self.check_pd_series_equal(sgpd_result, gpd_result)
+
+    def test_buffer(self):
+        buffer = self.g1.buffer(0.2)
+        assert buffer is not None
+        assert type(buffer) is GeoSeries
+        assert buffer.count() == 2
+
+        for _, geom in self.geoms:
+            dist = 0.2
+            sgpd_result = GeoSeries(geom).buffer(dist)
+            gpd_result = gpd.GeoSeries(geom).buffer(dist)
+
+            self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+
+    def test_buffer_then_area(self):
+        area = self.g1.buffer(0.2).area
+        assert area is not None
+        assert type(area) is ps.Series
+        assert area.count() == 2
+
+    def test_buffer_then_geoparquet(self):
+        temp_file_path = os.path.join(
+            self.tempdir, next(tempfile._get_candidate_names()) + ".parquet"
+        )
+        self.g1.buffer(0.2).to_parquet(temp_file_path)
+        assert os.path.exists(temp_file_path)
+
+    # -----------------------------------------------------------------------------
+    # # Utils
+    # -----------------------------------------------------------------------------
+
+    def check_sgpd_equals_spark_df(
+        self, actual: GeoSeries, expected: pyspark.sql.DataFrame
+    ):
+        assert isinstance(actual, GeoSeries)
+        assert isinstance(expected, pyspark.sql.DataFrame)
+        expected = expected.selectExpr("ST_AsText(expected) as expected")
+        sgpd_result = actual.to_geopandas()
+        expected = expected.toPandas()["expected"]
+        for a, e in zip(sgpd_result, expected):
+            self.assert_geometry_almost_equal(a, e)
+
+    def check_sgpd_equals_gpd(self, actual: GeoSeries, expected: gpd.GeoSeries):
+        assert isinstance(actual, GeoSeries)
+        assert isinstance(expected, gpd.GeoSeries)
+        sgpd_result = actual.to_geopandas()
+        for a, e in zip(sgpd_result, expected):
+            self.assert_geometry_almost_equal(
+                a, e, tolerance=1e-2
+            )  # increased tolerance from 1e-6
+
+    def check_pd_series_equal(self, actual: ps.Series, expected: pd.Series):
+        assert isinstance(actual, ps.Series)
+        assert isinstance(expected, pd.Series)
+        assert_series_equal(actual.to_pandas(), expected)
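
To exercise the two suites touched by this commit locally, something along these lines should work from the repository's python/ directory (assuming a development install with Spark and the test dependencies available):

    pytest tests/geopandas/test_geoseries.py tests/geopandas/test_match_geopandas_series.py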
