This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new da3eacde6b [SEDONA-717] Fix `dataframe_to_arrow()` for zero-row results (#1840)
da3eacde6b is described below

commit da3eacde6b7e9d7bc68e28f53440cf93c608b5ff
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Mar 4 18:41:39 2025 -0600

    [SEDONA-717] Fix `dataframe_to_arrow()` for zero-row results (#1840)
    
    * fix zero-row case
    
    * typo
    
    * fix lint
---
 python/sedona/utils/geoarrow.py     | 28 ++++++++++++++++++++++++++--
 python/tests/utils/test_geoarrow.py |  6 ++++++
 2 files changed, 32 insertions(+), 2 deletions(-)
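
For context on the fix: pyarrow cannot infer a schema from an empty list of
batches, which is why the zero-row path in the patch below has to pass the
Spark-derived schema explicitly. A minimal standalone sketch of both behaviors
(plain pyarrow, no Spark; the column name is illustrative):

    import pyarrow as pa

    schema = pa.schema([("wkt", pa.utf8())])

    # With no batches and no schema, pyarrow has nothing to infer the
    # column types from and raises ValueError.
    try:
        pa.Table.from_batches([])
    except ValueError:
        pass  # "Must pass schema, or at least one RecordBatch"

    # Passing the schema explicitly yields a valid zero-row table.
    empty = pa.Table.from_batches([], schema)
    assert empty.num_rows == 0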

diff --git a/python/sedona/utils/geoarrow.py b/python/sedona/utils/geoarrow.py
index 353b4ff7f8..b4a539dfa4 100644
--- a/python/sedona/utils/geoarrow.py
+++ b/python/sedona/utils/geoarrow.py
@@ -71,9 +71,9 @@ def dataframe_to_arrow(df, crs=None):
         # Using the extension type ensures that the type and its metadata will
         # propagate through all pyarrow transformations.
         import geoarrow.types as gat
-        from geoarrow.types.type_pyarrow import register_extension_types
 
-        register_extension_types()
+        try_register_extension_types()
+
         spec = gat.wkb()
 
         new_cols = [
@@ -117,7 +117,16 @@ def dataframe_to_arrow_raw(df):
 
     self_destruct = jconf.arrowPySparkSelfDestructEnabled()
     batches = df._collect_as_arrow(split_batches=self_destruct)
+
+    # The zero row case can use from_batches() with schema (nothing to cast)
+    if not batches:
+        return pa.Table.from_batches([], schema)
+
+    # When batches were returned, use cast(schema). This was backported from
+    # Spark, where presumably there is a good reason that the schemas of batches
+    # may not necessarily align with that of schema (thus a cast is required)
     table = pa.Table.from_batches(batches).cast(schema)
+
     # Ensure only the table has a reference to the batches, so that
     # self_destruct (if enabled) is effective
     del batches
@@ -161,6 +170,21 @@ def crs_to_json(crs):
         return pyproj.CRS(crs).to_json()
 
 
+def try_register_extension_types():
+    """Try to register extension types using geoarrow-types
+
+    Do this defensively, because it can fail if the extension type was
+    registered in some other way (notably: old versions of geoarrow-pyarrow,
+    which is a dependency of Kepler).
+    """
+    from geoarrow.types.type_pyarrow import register_extension_types
+
+    try:
+        register_extension_types()
+    except RuntimeError:
+        pass
+
+
 def unique_srid_from_ewkb(obj):
     import pyarrow as pa
     import pyarrow.compute as pc
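
The try/except in the new helper matters because extension-type registration
in pyarrow is global: if another package (notably an old geoarrow-pyarrow
pulled in by Kepler) has already registered the geoarrow types, a second
registration attempt can fail. Assuming that failure surfaces as the
RuntimeError the except clause expects, the helper is safe to call repeatedly.
A usage sketch:

    from sedona.utils.geoarrow import try_register_extension_types

    # Safe to call more than once; a second registration attempt (or one
    # preempted by an old geoarrow-pyarrow install) is silently ignored.
    try_register_extension_types()
    try_register_extension_types()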
diff --git a/python/tests/utils/test_geoarrow.py b/python/tests/utils/test_geoarrow.py
index e84dea84f4..28499c401c 100644
--- a/python/tests/utils/test_geoarrow.py
+++ b/python/tests/utils/test_geoarrow.py
@@ -37,6 +37,12 @@ class TestGeoArrow(TestBase):
         wkt_table = dataframe_to_arrow(wkt_df)
         assert wkt_table == pa.table({"wkt": TEST_WKT})
 
+    def test_to_geoarrow_zero_rows(self):
+        schema = StructType().add("wkt", StringType())
+        wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema).limit(0)
+        wkt_table = dataframe_to_arrow(wkt_df)
+        assert wkt_table == pa.table({"wkt": pa.array([], pa.utf8())})
+
     def test_to_geoarrow_with_geometry(self):
         schema = StructType().add("wkt", StringType())
         wkt_df = TestGeoArrow.spark.createDataFrame(zip(TEST_WKT), schema)
