This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new f4719ed2a2 [GH-2192] Geopandas: Implement `plot()` for GeoDataFrame and GeoSeries + `read_parquet` (#2193)
f4719ed2a2 is described below
commit f4719ed2a2382675b843b7d12a29764c2cfe3b80
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jul 29 20:04:35 2025 -0700
[GH-2192] Geopandas: Implement `plot()` for GeoDataFrame and GeoSeries + `read_parquet` (#2193)
* Copy original test_plotting into sedona's repo
* Fix up test_plotting to work
* Implement plot() in geoseries and geodataframe
* Remove gpd compat import
* Just set GEOS_GE_390=True in test_plotting
* skip tests for shapely < 2.0
* Skip tests if gpd < 1.0
* Add read_parquet
* Skip plotting tests that fail on older versions
* Add warning for unsupported params in read_parquet
* Remove test_plotting.py
* Add test_plot in test_geodataframe and test_series
---
python/sedona/geopandas/__init__.py | 2 +-
python/sedona/geopandas/base.py | 10 ++-
python/sedona/geopandas/geodataframe.py | 131 ++++++++++++++++++++++++++++
python/sedona/geopandas/geoseries.py | 50 +++++++++++
python/sedona/geopandas/io.py | 50 +++++++++++
python/tests/geopandas/test_geodataframe.py | 17 +++-
python/tests/geopandas/test_geoseries.py | 4 +
python/tests/geopandas/test_io.py | 3 +-
8 files changed, 262 insertions(+), 5 deletions(-)
diff --git a/python/sedona/geopandas/__init__.py b/python/sedona/geopandas/__init__.py
index 855f27d591..989f268c31 100644
--- a/python/sedona/geopandas/__init__.py
+++ b/python/sedona/geopandas/__init__.py
@@ -25,4 +25,4 @@ from sedona.geopandas.geodataframe import GeoDataFrame
from sedona.geopandas.tools import sjoin
-from sedona.geopandas.io import read_file
+from sedona.geopandas.io import read_file, read_parquet
diff --git a/python/sedona/geopandas/base.py b/python/sedona/geopandas/base.py
index 3c874816b7..d7837bec8e 100644
--- a/python/sedona/geopandas/base.py
+++ b/python/sedona/geopandas/base.py
@@ -2288,8 +2288,14 @@ class GeoFrame(metaclass=ABCMeta):
"simplify", self, tolerance, preserve_topology
)
- def sjoin(self, other, predicate="intersects", **kwargs):
- raise NotImplementedError("This method is not implemented yet.")
+ @abstractmethod
+ def to_geopandas(self) -> Union[gpd.GeoSeries, gpd.GeoDataFrame]: ...
+
+ @abstractmethod
+ def plot(self, *args, **kwargs): ...
+
+ @abstractmethod
+ def sjoin(self, other, predicate="intersects", **kwargs): ...
def _delegate_to_geometry_column(op, this, *args, **kwargs):
diff --git a/python/sedona/geopandas/geodataframe.py b/python/sedona/geopandas/geodataframe.py
index d3d0d21ce1..b2c3060c04 100644
--- a/python/sedona/geopandas/geodataframe.py
+++ b/python/sedona/geopandas/geodataframe.py
@@ -1184,6 +1184,137 @@ class GeoDataFrame(GeoFrame, pspd.DataFrame):
_not_implemented_error("type", "Returns numeric geometry type codes.")
)
+ def plot(self, *args, **kwargs):
+ """
+ Plot a GeoDataFrame.
+
+ Generate a plot of a GeoDataFrame with matplotlib. If a
+ column is specified, the plot coloring will be based on values
+ in that column.
+
+ Note: This method is not scalable and requires collecting all data to the driver.
+
+ Parameters
+ ----------
+ column : str, np.array, pd.Series, pd.Index (default None)
+ The name of the dataframe column, np.array, pd.Series, or pd.Index
+ to be plotted. If np.array, pd.Series, or pd.Index are used then it
+ must have same length as dataframe. Values are used to color the plot.
+ Ignored if `color` is also set.
+ kind: str
+ The kind of plots to produce. The default is to create a map ("geo").
+ Other supported kinds of plots from pandas:
+
+ - 'line' : line plot
+ - 'bar' : vertical bar plot
+ - 'barh' : horizontal bar plot
+ - 'hist' : histogram
+ - 'box' : BoxPlot
+ - 'kde' : Kernel Density Estimation plot
+ - 'density' : same as 'kde'
+ - 'area' : area plot
+ - 'pie' : pie plot
+ - 'scatter' : scatter plot
+ - 'hexbin' : hexbin plot.
+ cmap : str (default None)
+ The name of a colormap recognized by matplotlib.
+ color : str, np.array, pd.Series (default None)
+ If specified, all objects will be colored uniformly.
+ ax : matplotlib.pyplot.Artist (default None)
+ axes on which to draw the plot
+ cax : matplotlib.pyplot Artist (default None)
+ axes on which to draw the legend in case of color map.
+ categorical : bool (default False)
+ If False, cmap will reflect numerical values of the
+ column being plotted. For non-numerical columns, this
+ will be set to True.
+ legend : bool (default False)
+ Plot a legend. Ignored if no `column` is given, or if `color` is given.
+ scheme : str (default None)
+ Name of a choropleth classification scheme (requires mapclassify).
+ A mapclassify.MapClassifier object will be used
+ under the hood. Supported are all schemes provided by mapclassify (e.g.
+ 'BoxPlot', 'EqualInterval', 'FisherJenks', 'FisherJenksSampled',
+ 'HeadTailBreaks', 'JenksCaspall', 'JenksCaspallForced',
+ 'JenksCaspallSampled', 'MaxP', 'MaximumBreaks',
+ 'NaturalBreaks', 'Quantiles', 'Percentiles', 'StdMean',
+ 'UserDefined'). Arguments can be passed in classification_kwds.
+ k : int (default 5)
+ Number of classes (ignored if scheme is None)
+ vmin : None or float (default None)
+ Minimum value of cmap. If None, the minimum data value
+ in the column to be plotted is used.
+ vmax : None or float (default None)
+ Maximum value of cmap. If None, the maximum data value
+ in the column to be plotted is used.
+ markersize : str or float or sequence (default None)
+ Only applies to point geometries within a frame.
+ If a str, will use the values in the column of the frame specified
+ by markersize to set the size of markers. Otherwise can be a value
+ to apply to all points, or a sequence of the same length as the
+ number of points.
+ figsize : tuple of integers (default None)
+ Size of the resulting matplotlib.figure.Figure. If the argument
+ axes is given explicitly, figsize is ignored.
+ legend_kwds : dict (default None)
+ Keyword arguments to pass to :func:`matplotlib.pyplot.legend` or
+ :func:`matplotlib.pyplot.colorbar`.
+ Additional accepted keywords when `scheme` is specified:
+
+ fmt : string
+ A formatting specification for the bin edges of the classes in the
+ legend. For example, to have no decimals: ``{"fmt": "{:.0f}"}``.
+ labels : list-like
+ A list of legend labels to override the auto-generated labels.
+ Needs to have the same number of elements as the number of
+ classes (`k`).
+ interval : boolean (default False)
+ An option to control brackets from mapclassify legend.
+ If True, open/closed interval brackets are shown in the legend.
+ categories : list-like
+ Ordered list-like object of categories to be used for categorical plot.
+ classification_kwds : dict (default None)
+ Keyword arguments to pass to mapclassify
+ missing_kwds : dict (default None)
+ Keyword arguments specifying color options (as style_kwds)
+ to be passed on to geometries with missing values in addition to
+ or overwriting other style kwds. If None, geometries with missing
+ values are not plotted.
+ aspect : 'auto', 'equal', None or float (default 'auto')
+ Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
+ however data are not projected (coordinates are long/lat), the aspect is by
+ default set to 1/cos(df_y * pi/180) with df_y the y coordinate of the middle of
+ the GeoDataFrame (the mean of the y range of bounding box) so that a long/lat
+ square appears square in the middle of the plot. This implies an
+ Equirectangular projection. If None, the aspect of `ax` won't be changed. It can
+ also be set manually (float) as the ratio of y-unit to x-unit.
+ autolim : bool (default True)
+ Update axes data limits to contain the new geometries.
+ **style_kwds : dict
+ Style options to be passed on to the actual plot function, such
+ as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
+ ``alpha``.
+
+ Returns
+ -------
+ ax : matplotlib axes instance
+
+ Examples
+ --------
+ >>> import geodatasets
+ >>> df = geopandas.read_file(geodatasets.get_path("nybb"))
+ >>> df.head() # doctest: +SKIP
+ BoroCode ... geometry
+ 0 5 ... MULTIPOLYGON (((970217.022 145643.332, 970227....
+ 1 4 ... MULTIPOLYGON (((1029606.077 156073.814, 102957...
+ 2 3 ... MULTIPOLYGON (((1021176.479 151374.797, 102100...
+ 3 1 ... MULTIPOLYGON (((981219.056 188655.316, 980940....
+ 4 2 ... MULTIPOLYGON (((1012821.806 229228.265, 101278...
+
+ >>> df.plot("BoroName", cmap="Set1") # doctest: +SKIP
+ """
+ return self.to_geopandas().plot(*args, **kwargs)
+
# ============================================================================
# SPATIAL OPERATIONS
# ============================================================================
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 54bf9ecc6a..2075895681 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -1487,6 +1487,56 @@ class GeoSeries(GeoFrame, pspd.Series):
return self._query_geometry_column(spark_expr)
+ def plot(self, *args, **kwargs):
+ """
+ Plot a GeoSeries.
+
+ Generate a plot of a GeoSeries geometry with matplotlib.
+
+ Note: This method is not scalable and requires collecting all data to the driver.
+
+ Parameters
+ ----------
+ s : Series
+ The GeoSeries to be plotted. Currently Polygon,
+ MultiPolygon, LineString, MultiLineString, Point and MultiPoint
+ geometries can be plotted.
+ cmap : str (default None)
+ The name of a colormap recognized by matplotlib. Any
+ colormap will work, but categorical colormaps are
+ generally recommended. Examples of useful discrete
+ colormaps include:
+
+ tab10, tab20, Accent, Dark2, Paired, Pastel1, Set1, Set2
+
+ color : str, np.array, pd.Series, List (default None)
+ If specified, all objects will be colored uniformly.
+ ax : matplotlib.pyplot.Artist (default None)
+ axes on which to draw the plot
+ figsize : pair of floats (default None)
+ Size of the resulting matplotlib.figure.Figure. If the argument
+ ax is given explicitly, figsize is ignored.
+ aspect : 'auto', 'equal', None or float (default 'auto')
+ Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
+ however data are not projected (coordinates are long/lat), the aspect is by
+ default set to 1/cos(s_y * pi/180) with s_y the y coordinate of the middle of
+ the GeoSeries (the mean of the y range of bounding box) so that a long/lat
+ square appears square in the middle of the plot. This implies an
+ Equirectangular projection. If None, the aspect of `ax` won't be changed. It can
+ also be set manually (float) as the ratio of y-unit to x-unit.
+ autolim : bool (default True)
+ Update axes data limits to contain the new geometries.
+ **style_kwds : dict
+ Color options to be passed on to the actual plot function, such
+ as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
+ ``alpha``.
+
+ Returns
+ -------
+ ax : matplotlib axes instance
+ """
+ return self.to_geopandas().plot(*args, **kwargs)
+
def sjoin(
self,
other,
diff --git a/python/sedona/geopandas/io.py b/python/sedona/geopandas/io.py
index 3b4d8bdf02..a9da89eef7 100644
--- a/python/sedona/geopandas/io.py
+++ b/python/sedona/geopandas/io.py
@@ -236,3 +236,53 @@ def read_file(filename: str, format: Union[str, None] = None, **kwargs):
internal = InternalFrame(spark_frame=sdf,
index_spark_columns=index_spark_columns)
return GeoDataFrame(ps.DataFrame(internal))
+
+
+def read_parquet(
+ path,
+ columns=None,
+ storage_options=None,
+ bbox=None,
+ to_pandas_kwargs=None,
+ **kwargs,
+):
+ """
+ Load a Parquet object from the file path, returning a GeoDataFrame.
+
+ * if no geometry columns are read, this will raise a ``ValueError`` - you
+ should use the pandas `read_parquet` method instead.
+
+ If 'crs' key is not present in the GeoParquet metadata associated with the
+ Parquet object, it will default to "OGC:CRS84" according to the specification.
+
+ Parameters
+ ----------
+ path : str, path object
+ columns : list-like of strings, default=None
+ Not currently supported in Sedona
+ storage_options : dict, optional
+ Not currently supported in Sedona
+ bbox : tuple, optional
+ Not currently supported in Sedona
+ to_pandas_kwargs : dict, optional
+ Not currently supported in Sedona
+
+ Returns
+ -------
+ GeoDataFrame
+
+ Examples
+ --------
+ from sedona.geopandas import read_parquet
+ >>> df = read_parquet("data.parquet") # doctest: +SKIP
+
+ Specifying columns to read:
+
+ >>> df = read_parquet(
+ ... "data.parquet",
+ ... ) # doctest: +SKIP
+ """
+ if kwargs:
+ warnings.warn(f"The given arguments are not supported in Sedona: {kwargs}")
+
+ return read_file(path, format="geoparquet", **kwargs)
diff --git a/python/tests/geopandas/test_geodataframe.py b/python/tests/geopandas/test_geodataframe.py
index 6c7b7e7c11..abda5010c3 100644
--- a/python/tests/geopandas/test_geodataframe.py
+++ b/python/tests/geopandas/test_geodataframe.py
@@ -45,7 +45,7 @@ from packaging.version import parse as parse_version
parse_version(shapely.__version__) < parse_version("2.0.0"),
reason=f"Tests require shapely>=2.0.0, but found v{shapely.__version__}",
)
-class TestDataframe(TestGeopandasBase):
+class TestGeoDataFrame(TestGeopandasBase):
@pytest.mark.parametrize(
"obj",
[
@@ -126,6 +126,21 @@ class TestDataframe(TestGeopandasBase):
sgpd_df = sgpd.GeoDataFrame(obj)
assert_frame_equal(pd_df, sgpd_df.to_pandas())
+ def test_plot(self):
+ # Just make sure it doesn't error
+ df = GeoDataFrame(
+ {
+ "value1": ["a", "b", "c"],
+ "geometry": [
+ Point(0, 0),
+ Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
+ LineString([(0, 0), (1, 1)]),
+ ],
+ "value2": [1, 2, 3],
+ }
+ )
+ df.plot()
+
def test_psdf(self):
# this is to make sure the spark session works with pandas on spark api
psdf = ps.DataFrame(
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 261ac304fb..d73545f08a 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -95,6 +95,10 @@ class TestGeoSeries(TestGeopandasBase):
sgpd_series = GeoSeries(obj)
assert isinstance(sgpd_series, sgpd.GeoSeries)
+ def test_plot(self):
+ # Just make sure it doesn't error
+ self.geoseries.plot()
+
def test_area(self):
result = self.geoseries.area.to_pandas()
expected = pd.Series([0.0, 0.0, 5.23, 5.23])
diff --git a/python/tests/geopandas/test_io.py b/python/tests/geopandas/test_io.py
index 8e0ab68404..052216e15b 100644
--- a/python/tests/geopandas/test_io.py
+++ b/python/tests/geopandas/test_io.py
@@ -23,7 +23,7 @@ import pandas as pd
import geopandas as gpd
import pyspark.pandas as ps
from functools import partial
-from sedona.geopandas import GeoDataFrame, GeoSeries, read_file
+from sedona.geopandas import GeoDataFrame, GeoSeries, read_file, read_parquet
from tests import tests_resource
from tests.geopandas.test_geopandas_base import TestGeopandasBase
from shapely.geometry import (
@@ -102,6 +102,7 @@ class TestIO(TestGeopandasBase):
[
partial(GeoDataFrame.from_file, format="geoparquet"),
partial(read_file, format="GeoParquet"),
+ read_parquet,
],
)
def test_read_geoparquet(self, read_func):