This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new da3eacde6b [SEDONA-717] Fix `dataframe_to_arrow()` for zero-row results (#1840)
da3eacde6b is described below
commit da3eacde6b7e9d7bc68e28f53440cf93c608b5ff
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Mar 4 18:41:39 2025 -0600
[SEDONA-717] Fix `dataframe_to_arrow()` for zero-row results (#1840)
* fix zero-row case
* typo
* fix lint
---
python/sedona/utils/geoarrow.py | 28 ++++++++++++++++++++++++++--
python/tests/utils/test_geoarrow.py | 6 ++++++
2 files changed, 32 insertions(+), 2 deletions(-)
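(For context, a minimal standalone sketch of why the zero-row path in the patch below passes the schema explicitly: pyarrow cannot infer a schema from an empty list of record batches, so from_batches() needs it up front. The "wkt" column name here is only illustrative.)

    import pyarrow as pa

    schema = pa.schema([("wkt", pa.string())])

    # With no batches there is nothing to infer a schema from, so
    # pa.Table.from_batches([]) would raise; passing the schema works.
    empty = pa.Table.from_batches([], schema)
    assert empty.num_rows == 0
    assert empty.schema == schema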
diff --git a/python/sedona/utils/geoarrow.py b/python/sedona/utils/geoarrow.py
index 353b4ff7f8..b4a539dfa4 100644
--- a/python/sedona/utils/geoarrow.py
+++ b/python/sedona/utils/geoarrow.py
@@ -71,9 +71,9 @@ def dataframe_to_arrow(df, crs=None):
# Using the extension type ensures that the type and its metadata will
# propagate through all pyarrow transformations.
import geoarrow.types as gat
- from geoarrow.types.type_pyarrow import register_extension_types
- register_extension_types()
+ try_register_extension_types()
+
spec = gat.wkb()
new_cols = [
@@ -117,7 +117,16 @@ def dataframe_to_arrow_raw(df):
self_destruct = jconf.arrowPySparkSelfDestructEnabled()
batches = df._collect_as_arrow(split_batches=self_destruct)
+
+ # The zero row case can use from_batches() with schema (nothing to cast)
+ if not batches:
+ return pa.Table.from_batches([], schema)
+
+ # When batches were returned, use cast(schema). This was backported from
+ # Spark, where presumably there is a good reason that the schemas of batches
+ # may not necessarily align with that of schema (thus a cast is required)
table = pa.Table.from_batches(batches).cast(schema)
+
# Ensure only the table has a reference to the batches, so that
# self_destruct (if enabled) is effective
del batches
@@ -161,6 +170,21 @@ def crs_to_json(crs):
return pyproj.CRS(crs).to_json()
+def try_register_extension_types():
+ """Try to register extension types using geoarrow-types
+
+ Do this defensively, because it can fail if the extension type was
+ registered in some other way (notably: old versions of geoarrow-pyarrow,
+ which is a dependency of Kepler).
+ """
+ from geoarrow.types.type_pyarrow import register_extension_types
+
+ try:
+ register_extension_types()
+ except RuntimeError:
+ pass
+
+
def unique_srid_from_ewkb(obj):
import pyarrow as pa
import pyarrow.compute as pc
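(A short usage sketch, assuming geoarrow-types is installed; try_register_extension_types is the helper added by this commit, and repeated calls are intended to be safe because a failed re-registration is swallowed.)

    from sedona.utils.geoarrow import try_register_extension_types

    # The first call registers the geoarrow extension types (if not already done);
    # later calls, or calls after another library registered them, are no-ops.
    try_register_extension_types()
    try_register_extension_types()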
diff --git a/python/tests/utils/test_geoarrow.py b/python/tests/utils/test_geoarrow.py
index e84dea84f4..28499c401c 100644
--- a/python/tests/utils/test_geoarrow.py
+++ b/python/tests/utils/test_geoarrow.py
@@ -37,6 +37,12 @@ class TestGeoArrow(TestBase):
wkt_table = dataframe_to_arrow(wkt_df)
assert wkt_table == pa.table({"wkt": TEST_WKT})
+ def test_to_geoarrow_zero_rows(self):
+ schema = StructType().add("wkt", StringType())
+ wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema).limit(0)
+ wkt_table = dataframe_to_arrow(wkt_df)
+ assert wkt_table == pa.table({"wkt": pa.array([], pa.utf8())})
+
def test_to_geoarrow_with_geometry(self):
schema = StructType().add("wkt", StringType())
wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema)
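(End-to-end sketch of the fixed behaviour, assuming a running SparkSession named `spark` with Sedona available; the zero-row DataFrame mirrors the new test case.)

    from sedona.utils.geoarrow import dataframe_to_arrow

    df = spark.createDataFrame([("POINT (0 1)",)], "wkt string")
    empty_table = dataframe_to_arrow(df.limit(0))
    assert empty_table.num_rows == 0            # no rows were collected
    assert empty_table.schema.names == ["wkt"]  # column schema is preserved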