This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 652db60604 [GH-2049] Retain index information in query results (#2086)
652db60604 is described below
commit 652db606045a5436d7464b98ca6d3794c52a778d
Author: Peter Nguyen <[email protected]>
AuthorDate: Fri Jul 11 10:32:16 2025 -0700
[GH-2049] Retain index information in query results (#2086)
* Implement retain index information in results of queries
* Update comment
---
python/sedona/geopandas/geoseries.py | 32 ++++++++++++++--------
python/tests/geopandas/test_geoseries.py | 18 ++++++++++--
.../tests/geopandas/test_match_geopandas_series.py | 14 ++++++++++
3 files changed, 50 insertions(+), 14 deletions(-)
diff --git a/python/sedona/geopandas/geoseries.py b/python/sedona/geopandas/geoseries.py
index 31cee8f7a0..06602b27ce 100644
--- a/python/sedona/geopandas/geoseries.py
+++ b/python/sedona/geopandas/geoseries.py
@@ -243,10 +243,9 @@ class GeoSeries(GeoFrame, pspd.Series):
"""
from pyproj import CRS
- tmp_df = self._process_geometry_column(
- "ST_SRID", rename="crs", returns_geom=False
- )
- srid = tmp_df.take([0])[0]
+ tmp = self._process_geometry_column("ST_SRID", rename="crs", returns_geom=False)
+ ps_series = tmp.take([0])
+ srid = ps_series.iloc[0]
# Sedona returns 0 if doesn't exist
return CRS.from_user_input(srid) if srid != 0 and not pd.isna(srid) else None
@@ -494,14 +493,17 @@ class GeoSeries(GeoFrame, pspd.Series):
query = f"{query} as `{rename}`"
- sdf = df.selectExpr(query)
- internal = InternalFrame(
+ # We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result
+ # We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info
+ sdf = df.selectExpr(query, SPARK_DEFAULT_INDEX_NAME, NATURAL_ORDER_COLUMN_NAME)
+
+ internal = self._internal.copy(
spark_frame=sdf,
- index_spark_columns=None,
- column_labels=[self._column_label],
+ index_fields=[self._internal.index_fields[0]],
+ index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
data_spark_columns=[scol_for(sdf, rename)],
- data_fields=[self._internal.data_fields[0]],
- column_label_names=self._internal.column_label_names,
+ data_fields=[self._internal.data_fields[0].copy(name=rename)],
+ column_label_names=[(rename,)],
)
ps_series = first_series(PandasOnSparkDataFrame(internal))
@@ -1372,13 +1374,19 @@ class GeoSeries(GeoFrame, pspd.Series):
assert isinstance(other, GeoSeries), f"Invalid type for other: {type(other)}"
- # TODO: this does not yet support multi-index
+ # This code assumes there is only one index (SPARK_DEFAULT_INDEX_NAME)
+ # and would need to be updated if Sedona later supports multi-index
df = self._internal.spark_frame.select(
col(self.get_first_geometry_column()).alias("L"),
- col(index_col),
+ # For the left side:
+ # - We always select NATURAL_ORDER_COLUMN_NAME, to avoid having to regenerate it in the result
+ # - We always select SPARK_DEFAULT_INDEX_NAME, to retain series index info
+ col(NATURAL_ORDER_COLUMN_NAME),
+ col(SPARK_DEFAULT_INDEX_NAME),
)
other_df = other._internal.spark_frame.select(
col(other.get_first_geometry_column()).alias("R"),
+ # for the right side, we only need the column that we are joining on
col(index_col),
)
joined_df = df.join(other_df, on=index_col, how="outer")
diff --git a/python/tests/geopandas/test_geoseries.py b/python/tests/geopandas/test_geoseries.py
index 0f89d547ab..6aabf33f25 100644
--- a/python/tests/geopandas/test_geoseries.py
+++ b/python/tests/geopandas/test_geoseries.py
@@ -626,6 +626,8 @@ class TestGeoSeries(TestBase):
expected = pd.Series([True, True, True, True])
def test_intersection(self):
+ import pyspark.pandas as ps
+
s = sgpd.GeoSeries(
[
Polygon([(0, 0), (2, 2), (0, 2)]),
@@ -691,7 +693,15 @@ class TestGeoSeries(TestBase):
Point(0, 1),
],
index=range(1, 6),
+ crs=4326,
)
+
+ # Ensure the index is preserved when crs is set (previously an issue)
+ expected_index = ps.Index(range(1, 6))
+ ps.set_option("compute.ops_on_diff_frames", True)
+ assert s2.index.equals(expected_index)
+ ps.reset_option("compute.ops_on_diff_frames")
+
result = s.intersection(s2, align=True)
expected = gpd.GeoSeries(
[
@@ -705,7 +715,7 @@ class TestGeoSeries(TestBase):
)
self.check_sgpd_equals_gpd(result, expected)
- result = s.intersection(s2, align=False)
+ result = s2.intersection(s, align=False)
expected = gpd.GeoSeries(
[
Polygon([(0, 0), (0, 1), (1, 1), (0, 0)]),
@@ -713,10 +723,14 @@ class TestGeoSeries(TestBase):
Point(1, 1),
Point(1, 1),
Point(0, 1),
- ]
+ ],
+ index=range(1, 6), # left's index
)
self.check_sgpd_equals_gpd(result, expected)
+ # Ensure result of align=False retains the left's index
+ assert result.index.to_pandas().equals(expected.index)
+
def test_intersection_all(self):
pass
diff --git a/python/tests/geopandas/test_match_geopandas_series.py b/python/tests/geopandas/test_match_geopandas_series.py
index f19c04e83a..cbb61527c5 100644
--- a/python/tests/geopandas/test_match_geopandas_series.py
+++ b/python/tests/geopandas/test_match_geopandas_series.py
@@ -594,6 +594,20 @@ class TestMatchGeopandasSeries(TestBase):
Polygon([(2, 0), (3, 0), (3, 3), (2, 3)]),
Point(0, 0),
]
+
+ # Ensure resulting index behavior is correct for align=False (retain the left's index)
+ index1 = range(1, len(geometries) + 1)
+ index2 = range(len(geometries))
+ sgpd_result = GeoSeries(geometries, index1).intersection(
+ GeoSeries(geometries, index2), align=False
+ )
+
+ gpd_result = gpd.GeoSeries(geometries, index1).intersection(
+ gpd.GeoSeries(geometries, index2), align=False
+ )
+ self.check_sgpd_equals_gpd(sgpd_result, gpd_result)
+ assert sgpd_result.index.to_pandas().equals(gpd_result.index)
+
for g1 in geometries:
for g2 in geometries:
sgpd_result = GeoSeries(g1).intersection(GeoSeries(g2))