This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new c18dddac87 [GH-2039] Geopandas.GeoSeries: Implement from_wkb,
from_wkt, from_xy + temp __init__ fix (#2040)
c18dddac87 is described below
commit c18dddac87c1d3a781ce791ed2aee8d8f6779f00
Author: Peter Nguyen <[email protected]>
AuthorDate: Thu Jul 3 21:41:57 2025 -0700
[GH-2039] Geopandas.GeoSeries: Implement from_wkb, from_wkt, from_xy + temp
__init__ fix (#2040)
---
python/sedona/geopandas/geoseries.py | 275 ++++++++++++++++++++-
python/tests/geopandas/test_geoseries.py | 40 ++-
.../tests/geopandas/test_match_geopandas_series.py | 32 ++-
3 files changed, 335 insertions(+), 12 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py
b/python/sedona/geopandas/geoseries.py
index 6adb2f1529..6f68c3c056 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -123,9 +123,6 @@ class GeoSeries(GeoFrame, pspd.Series):
assert not copy
assert not fastpath
- self._anchor = data
- self._col_label = index
-
data_crs = None
if hasattr(data, "crs"):
data_crs = data.crs
@@ -136,6 +133,20 @@ class GeoSeries(GeoFrame, pspd.Series):
"allow_override=True)' to overwrite CRS or "
"'GeoSeries.to_crs(crs)' to reproject geometries. "
)
+ # This is a temporary workaround since pyspark errors when
creating a ps.Series from a ps.Series
+ # This is NOT a scalable solution since we call to_pandas() on the
data and is a hacky solution
+ # but this should be resolved if/once
https://github.com/apache/spark/pull/51300 is merged in.
+ # For now, we reset self._anchor = data to have keep the geometry
information (e.g crs) that's lost in to_pandas()
+ super().__init__(
+ data=data.to_pandas(),
+ index=index,
+ dtype=dtype,
+ name=name,
+ copy=copy,
+ fastpath=fastpath,
+ )
+
+ self._anchor = data
else:
if isinstance(data, pd.Series):
assert index is None
@@ -1360,7 +1371,82 @@ class GeoSeries(GeoFrame, pspd.Series):
on_invalid="raise",
**kwargs,
) -> "GeoSeries":
- raise NotImplementedError("GeoSeries.from_wkb() is not implemented
yet.")
+ r"""
+ Alternate constructor to create a ``GeoSeries``
+ from a list or array of WKB objects
+
+ Parameters
+ ----------
+ data : array-like or Series
+ Series, list or array of WKB objects
+ index : array-like or Index
+ The index for the GeoSeries.
+ crs : value, optional
+ Coordinate Reference System of the geometry objects. Can be
anything
+ accepted by
+ :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ on_invalid: {"raise", "warn", "ignore"}, default "raise"
+ - raise: an exception will be raised if a WKB input geometry is
invalid.
+ - warn: a warning will be raised and invalid WKB geometries will
be returned
+ as None.
+ - ignore: invalid WKB geometries will be returned as None without
a warning.
+ - fix: an effort is made to fix invalid input geometries (e.g.
close
+ unclosed rings). If this is not possible, they are returned as
``None``
+ without a warning. Requires GEOS >= 3.11 and shapely >= 2.1.
+
+ kwargs
+ Additional arguments passed to the Series constructor,
+ e.g. ``name``.
+
+ Returns
+ -------
+ GeoSeries
+
+ See Also
+ --------
+ GeoSeries.from_wkt
+
+ Examples
+ --------
+
+ >>> wkbs = [
+ ... (
+ ... b"\x01\x01\x00\x00\x00\x00\x00\x00\x00"
+ ... b"\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?"
+ ... ),
+ ... (
+ ... b"\x01\x01\x00\x00\x00\x00\x00\x00\x00"
+ ... b"\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@"
+ ... ),
+ ... (
+ ... b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00"
+ ... b"\x00\x08@\x00\x00\x00\x00\x00\x00\x08@"
+ ... ),
+ ... ]
+ >>> s = geopandas.GeoSeries.from_wkb(wkbs)
+ >>> s
+ 0 POINT (1 1)
+ 1 POINT (2 2)
+ 2 POINT (3 3)
+ dtype: geometry
+ """
+ if on_invalid != "raise":
+ raise NotImplementedError(
+ "GeoSeries.from_wkb(): only on_invalid='raise' is implemented"
+ )
+
+ from pyspark.sql.types import StructType, StructField, BinaryType
+
+ schema = StructType([StructField("data", BinaryType(), True)])
+ return cls._create_from_select(
+ f"ST_GeomFromWKB(`data`)",
+ data,
+ schema,
+ index,
+ crs,
+ **kwargs,
+ )
@classmethod
def from_wkt(
@@ -1371,11 +1457,154 @@ class GeoSeries(GeoFrame, pspd.Series):
on_invalid="raise",
**kwargs,
) -> "GeoSeries":
- raise NotImplementedError("GeoSeries.from_wkt() is not implemented
yet.")
+ """
+ Alternate constructor to create a ``GeoSeries``
+ from a list or array of WKT objects
+
+ Parameters
+ ----------
+ data : array-like, Series
+ Series, list, or array of WKT objects
+ index : array-like or Index
+ The index for the GeoSeries.
+ crs : value, optional
+ Coordinate Reference System of the geometry objects. Can be
anything
+ accepted by
+ :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ on_invalid : {"raise", "warn", "ignore"}, default "raise"
+ - raise: an exception will be raised if a WKT input geometry is
invalid.
+ - warn: a warning will be raised and invalid WKT geometries will be
+ returned as ``None``.
+ - ignore: invalid WKT geometries will be returned as ``None``
without a
+ warning.
+ - fix: an effort is made to fix invalid input geometries (e.g.
close
+ unclosed rings). If this is not possible, they are returned as
``None``
+ without a warning. Requires GEOS >= 3.11 and shapely >= 2.1.
+
+ kwargs
+ Additional arguments passed to the Series constructor,
+ e.g. ``name``.
+
+ Returns
+ -------
+ GeoSeries
+
+ See Also
+ --------
+ GeoSeries.from_wkb
+
+ Examples
+ --------
+
+ >>> wkts = [
+ ... 'POINT (1 1)',
+ ... 'POINT (2 2)',
+ ... 'POINT (3 3)',
+ ... ]
+ >>> s = geopandas.GeoSeries.from_wkt(wkts)
+ >>> s
+ 0 POINT (1 1)
+ 1 POINT (2 2)
+ 2 POINT (3 3)
+ dtype: geometry
+ """
+ if on_invalid != "raise":
+ raise NotImplementedError(
+ "GeoSeries.from_wkt(): only on_invalid='raise' is implemented"
+ )
+
+ from pyspark.sql.types import StructType, StructField, StringType
+
+ schema = StructType([StructField("data", StringType(), True)])
+ return cls._create_from_select(
+ f"ST_GeomFromText(`data`)",
+ data,
+ schema,
+ index,
+ crs,
+ **kwargs,
+ )
@classmethod
def from_xy(cls, x, y, z=None, index=None, crs=None, **kwargs) ->
"GeoSeries":
- raise NotImplementedError("GeoSeries.from_xy() is not implemented
yet.")
+ """
+ Alternate constructor to create a :class:`~geopandas.GeoSeries` of
Point
+ geometries from lists or arrays of x, y(, z) coordinates
+
+ In case of geographic coordinates, it is assumed that longitude is
captured
+ by ``x`` coordinates and latitude by ``y``.
+
+ Parameters
+ ----------
+ x, y, z : iterable
+ index : array-like or Index, optional
+ The index for the GeoSeries. If not given and all coordinate inputs
+ are Series with an equal index, that index is used.
+ crs : value, optional
+ Coordinate Reference System of the geometry objects. Can be
anything
+ accepted by
+ :meth:`pyproj.CRS.from_user_input()
<pyproj.crs.CRS.from_user_input>`,
+ such as an authority string (eg "EPSG:4326") or a WKT string.
+ **kwargs
+ Additional arguments passed to the Series constructor,
+ e.g. ``name``.
+
+ Returns
+ -------
+ GeoSeries
+
+ See Also
+ --------
+ GeoSeries.from_wkt
+ points_from_xy
+
+ Examples
+ --------
+
+ >>> x = [2.5, 5, -3.0]
+ >>> y = [0.5, 1, 1.5]
+ >>> s = geopandas.GeoSeries.from_xy(x, y, crs="EPSG:4326")
+ >>> s
+ 0 POINT (2.5 0.5)
+ 1 POINT (5 1)
+ 2 POINT (-3 1.5)
+ dtype: geometry
+ """
+ from pyspark.sql.types import StructType, StructField, DoubleType
+
+ schema = StructType(
+ [StructField("x", DoubleType(), True), StructField("y",
DoubleType(), True)]
+ )
+
+ # Spark doesn't automatically cast ints to floats for us
+ x = [float(num) for num in x]
+ y = [float(num) for num in y]
+ z = [float(num) for num in z] if z else None
+
+ if z:
+ data = list(zip(x, y, z))
+ select = f"ST_PointZ(`x`, `y`, `z`)"
+ schema.add(StructField("z", DoubleType(), True))
+ else:
+ data = list(zip(x, y))
+ select = f"ST_Point(`x`, `y`)"
+
+ geoseries = cls._create_from_select(
+ select,
+ data,
+ schema,
+ index,
+ crs,
+ **kwargs,
+ )
+
+ if crs:
+ from pyproj import CRS
+
+ geoseries.crs = CRS.from_user_input(crs).to_epsg()
+
+ return geoseries
@classmethod
def from_shapely(
@@ -1387,6 +1616,40 @@ class GeoSeries(GeoFrame, pspd.Series):
def from_arrow(cls, arr, **kwargs) -> "GeoSeries":
raise NotImplementedError("GeoSeries.from_arrow() is not implemented
yet.")
+ @classmethod
+ def _create_from_select(
+ cls, select: str, data, schema, index, crs, **kwargs
+ ) -> "GeoSeries":
+
+ from pyspark.pandas.utils import default_session
+ from pyspark.pandas.internal import InternalField
+ import numpy as np
+
+ if isinstance(data, list) and not isinstance(data[0], (tuple, list)):
+ data = [(obj,) for obj in data]
+
+ select = f"{select} as geometry"
+
+ spark_df = default_session().createDataFrame(data, schema=schema)
+ spark_df = spark_df.selectExpr(select)
+
+ internal = InternalFrame(
+ spark_frame=spark_df,
+ index_spark_columns=None,
+ column_labels=[("geometry",)],
+ data_spark_columns=[scol_for(spark_df, "geometry")],
+ data_fields=[
+ InternalField(np.dtype("object"), spark_df.schema["geometry"])
+ ],
+ column_label_names=[("geometry",)],
+ )
+ return GeoSeries(
+ first_series(PandasOnSparkDataFrame(internal)),
+ index,
+ crs=crs,
+ name=kwargs.get("name", None),
+ )
+
def to_file(
self,
filename: Union[os.PathLike, typing.IO],
diff --git a/python/tests/geopandas/test_geoseries.py
b/python/tests/geopandas/test_geoseries.py
index 3c7951e941..78cb66812e 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -119,13 +119,47 @@ class TestGeoSeries(TestBase):
pass
def test_from_wkb(self):
- pass
+ wkbs = [
+ (
+ b"\x01\x01\x00\x00\x00\x00\x00\x00\x00"
+ b"\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?"
+ ),
+ (
+ b"\x01\x01\x00\x00\x00\x00\x00\x00\x00"
+ b"\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00@"
+ ),
+ (
+ b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00"
+ b"\x00\x08@\x00\x00\x00\x00\x00\x00\x08@"
+ ),
+ ]
+ s = sgpd.GeoSeries.from_wkb(wkbs)
+ expected = gpd.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)])
+ self.check_sgpd_equals_gpd(s, expected)
def test_from_wkt(self):
- pass
+ wkts = [
+ "POINT (1 1)",
+ "POINT (2 2)",
+ "POINT (3 3)",
+ ]
+ s = sgpd.GeoSeries.from_wkt(wkts)
+ expected = gpd.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)])
+ self.check_sgpd_equals_gpd(s, expected)
def test_from_xy(self):
- pass
+ x = [2.5, 5, -3.0]
+ y = [0.5, 1, 1.5]
+ s = sgpd.GeoSeries.from_xy(x, y, crs="EPSG:4326")
+ expected = gpd.GeoSeries([Point(2.5, 0.5), Point(5, 1), Point(-3,
1.5)])
+ self.check_sgpd_equals_gpd(s, expected)
+
+ z = [1, 2, 3]
+ s = sgpd.GeoSeries.from_xy(x, y, z)
+ expected = gpd.GeoSeries(
+ [Point(2.5, 0.5, 1), Point(5, 1, 2), Point(-3, 1.5, 3)]
+ )
+ self.check_sgpd_equals_gpd(s, expected)
def test_from_shapely(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py
b/python/tests/geopandas/test_match_geopandas_series.py
index 4a4a3495ca..00ad6687ee 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -241,13 +241,39 @@ class TestMatchGeopandasSeries(TestBase):
pass
def test_from_wkb(self):
- pass
+ for _, geom in self.geoms:
+ wkb = [g.wkb for g in geom]
+ sgpd_result = GeoSeries.from_wkb(wkb)
+ gpd_result = gpd.GeoSeries.from_wkb(wkb)
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
def test_from_wkt(self):
- pass
+ for _, geom in self.geoms:
+ wkt = [g.wkt for g in geom]
+ sgpd_result = GeoSeries.from_wkt(wkt)
+ gpd_result = gpd.GeoSeries.from_wkt(wkt)
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
def test_from_xy(self):
- pass
+ tests = [
+ [
+ [2.5, 0.5, 5.0, -2], # x
+ [5, 10, 0, 1], # y
+ [-3, 1.5, -1000, 25], # z
+ "EPSG:4326",
+ ],
+ [
+ [2.5, -0.5, 1, 500], # x
+ [5, 1, -100, 1000], # y
+ None,
+ None,
+ ],
+ ]
+ for x, y, z, crs in tests:
+ sgpd_result = GeoSeries.from_xy(x, y, z, crs=crs)
+ gpd_result = gpd.GeoSeries.from_xy(x, y, z, crs=crs)
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+ assert sgpd_result.crs == gpd_result.crs
def test_from_shapely(self):
pass