(sedona) branch master updated: [GH-2049] Retain index information in query results (#2086)

jiayu Fri, 11 Jul 2025 10:33:15 -0700

This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git



The following commit(s) were added to refs/heads/master by this push:
     new 652db60604 [GH-2049] Retain index information in query results (#2086)
652db60604 is described below

commit 652db606045a5436d7464b98ca6d3794c52a778d
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Jul 11 10:32:16 2025 -0700

    [GH-2049] Retain index information in query results (#2086)
    
    * Implement retain index information in results of queries
    
    * Update comment
---
 python/sedona/geopandas/geoseries.py               | 32 ++++++++++++++--------
 python/tests/geopandas/test_geoseries.py           | 18 ++++++++++--
 .../tests/geopandas/test_match_geopandas_series.py | 14 ++++++++++
 3 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 31cee8f7a0..06602b27ce 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -243,10 +243,9 @@ class GeoSeries(GeoFrame, pspd.Series):
         """
         from pyproj import CRS
 
-        tmp_df = self._process_geometry_column(
-            "ST_SRID", rename="crs", returns_geom=False
-        )
-        srid = tmp_df.take([0])[0]
+        tmp = self._process_geometry_column("ST_SRID", rename="crs", returns_geom=False)
+        ps_series = tmp.take([0])
+        srid = ps_series.iloc[0]
         # Sedona returns 0 if doesn't exist
         return CRS.from_user_input(srid) if srid != 0 and not pd.isna(srid) else None
 
@@ -494,14 +493,17 @@ class GeoSeries(GeoFrame, pspd.Series):
 
             query = f"{query} as `{rename}`"
 
-        sdf = df.selectExpr(query)
-        internal = InternalFrame(
+        # We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result
+        # We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info
+        sdf = df.selectExpr(query, SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME)
+
+        internal = self._internal.copy(
             spark_frame=sdf,
-            index_spark_columns=None,
-            column_labels=[self._column_label],
+            index_fields=[self._internal.index_fields[0]],
+            index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
             data_spark_columns=[scol_for(sdf, rename)],
-            data_fields=[self._internal.data_fields[0]],
-            column_label_names=self._internal.column_label_names,
+            data_fields=[self._internal.data_fields[0].copy(name=rename)],
+            column_label_names=[(rename,)],
         )
         ps_series = first_series(PandasOnSparkDataFrame(internal))
 
@@ -1372,13 +1374,19 @@ class GeoSeries(GeoFrame, pspd.Series):
 
         assert isinstance(other, GeoSeries), f"Invalid type for other: {type(other)}"
 
-        # TODO: this does not yet support multi-index
+        # This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
+        # and would need to be updated if Sedona later supports multi-index
         df = self._internal.spark_frame.select(
             col(self.get_first_geometry_column()).alias("L"),
-            col(index_col),
+            # For the left side:
+            # - We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result
+            # - We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info
+            col(NATURAL_ORDER_COLUMN_NAME),
+            col(SPARK_DEFAULT_INDEX_NAME),
         )
         other_df = other._internal.spark_frame.select(
             col(other.get_first_geometry_column()).alias("R"),
+            # for the right side, we only need the column that we are joining on
             col(index_col),
         )
         joined_df = df.join(other_df, on=index_col, how="outer")
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 0f89d547ab..6aabf33f25 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -626,6 +626,8 @@ class TestGeoSeries(TestBase):
         expected = pd.Series([True, True, True, True])
 
     def test_intersection(self):
+        import pyspark.pandas as ps
+
         s = sgpd.GeoSeries(
             [
                 Polygon([(0, 0), (2, 2), (0, 2)]),
@@ -691,7 +693,15 @@ class TestGeoSeries(TestBase):
                 Point(0, 1),
             ],
             index=range(1, 6),
+            crs=4326,
         )
+
+        # Ensure the index is preserved when crs is set (previously an issue)
+        expected_index = ps.Index(range(1, 6))
+        ps.set_option("compute.ops_on_diff_frames", True)
+        assert s2.index.equals(expected_index)
+        ps.reset_option("compute.ops_on_diff_frames")
+
         result = s.intersection(s2, align=True)
         expected = gpd.GeoSeries(
             [
@@ -705,7 +715,7 @@ class TestGeoSeries(TestBase):
         )
         self.check_sgpd_equals_gpd(result, expected)
 
-        result = s.intersection(s2, align=False)
+        result = s2.intersection(s, align=False)
         expected = gpd.GeoSeries(
             [
                 Polygon([(0, 0), (0, 1), (1, 1), (0, 0)]),
@@ -713,10 +723,14 @@ class TestGeoSeries(TestBase):
                 Point(1, 1),
                 Point(1, 1),
                 Point(0, 1),
-            ]
+            ],
+            index=range(1, 6),  # left's index
         )
         self.check_sgpd_equals_gpd(result, expected)
 
+        # Ensure result of align=False retains the left's index
+        assert result.index.to_pandas().equals(expected.index)
+
     def test_intersection_all(self):
         pass
 
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index f19c04e83a..cbb61527c5 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -594,6 +594,20 @@ class TestMatchGeopandasSeries(TestBase):
             Polygon([(2, 0), (3, 0), (3, 3), (2, 3)]),
             Point(0, 0),
         ]
+
+        # Ensure resulting index behavior is correct for align=False (retain the left's index)
+        index1 = range(1, len(geometries) + 1)
+        index2 = range(len(geometries))
+        sgpd_result = GeoSeries(geometries, index1).intersection(
+            GeoSeries(geometries, index2), align=False
+        )
+
+        gpd_result = gpd.GeoSeries(geometries, index1).intersection(
+            gpd.GeoSeries(geometries, index2), align=False
+        )
+        self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+        assert sgpd_result.index.to_pandas().equals(gpd_result.index)
+
         for g1 in geometries:
             for g2 in geometries:
                 sgpd_result = GeoSeries(g1).intersection(GeoSeries(g2))

(sedona) branch master updated: [GH-2049] Retain index information in query results (#2086)

Reply via email to