This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new d19c6e8 chore(python/sedonadb): Add integration tests for GeoParquet reader (#6)
d19c6e8 is described below
commit d19c6e8580613c7a1baf05fe2bdcc8e9c599e1aa
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Aug 29 21:51:09 2025 +0000
chore(python/sedonadb): Add integration tests for GeoParquet reader (#6)
* add sedona-testing submodule
* integration test for geoparquet
* also check file-level pruning
* license
* format
* maybe fix test failure
---
.github/workflows/python.yml | 9 +-
.gitmodules | 3 +
python/sedonadb/python/sedonadb/testing.py | 33 ++++++-
python/sedonadb/tests/conftest.py | 5 ++
python/sedonadb/tests/io/test_parquet.py | 134 +++++++++++++++++++++++++++++
submodules/download-assets.py | 63 ++++++++++----
submodules/sedona-testing | 1 +
7 files changed, 224 insertions(+), 24 deletions(-)
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 29e64bc..9ef7d16 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -14,6 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+
name: python
on:
@@ -73,14 +74,18 @@ jobs:
run: |
pip install -e "python/sedonadb/[test]" -vv
+ - name: Download minimal geoarrow-data assets
+ run: |
+ python submodules/download-assets.py "*water-junc*" "*water-point*"
+
- name: Start PostGIS
run: |
docker compose up --wait --detach postgis
- name: Run tests
env:
- # Ensure that there are no skips for PostGIS or duckdb not available
- SEDONADB_PYTHON_ENSURE_ALL_ENGINES: "true"
+ # Ensure that we don't skip tests that we didn't intend to
+ SEDONADB_PYTHON_NO_SKIP_TESTS: "true"
run: |
cd python
python -m pytest -vv
diff --git a/.gitmodules b/.gitmodules
index 6383224..2923807 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,3 +23,6 @@
[submodule "c/sedona-s2geography/s2geometry"]
path = c/sedona-s2geography/s2geometry
url = https://github.com/google/s2geometry.git
+[submodule "submodules/sedona-testing"]
+ path = submodules/sedona-testing
+ url = https://github.com/apache/sedona-testing.git
diff --git a/python/sedonadb/python/sedonadb/testing.py b/python/sedonadb/python/sedonadb/testing.py
index 6d151be..fe90910 100644
--- a/python/sedonadb/python/sedonadb/testing.py
+++ b/python/sedonadb/python/sedonadb/testing.py
@@ -16,6 +16,7 @@
# under the License.
import os
import warnings
+from pathlib import Path
from typing import TYPE_CHECKING, List, Tuple
import geoarrow.pyarrow as ga
@@ -27,6 +28,29 @@ if TYPE_CHECKING:
import sedonadb
+def skip_if_not_exists(path: Path):
+ """Skip a test using pytest.skip() if path does not exist
+
+ If SEDONADB_PYTHON_NO_SKIP_TESTS is set, this function will never skip to
+ avoid accidentally skipping tests on CI.
+ """
+ if _env_no_skip():
+ return
+
+ if not path.exists():
+ import pytest
+
+ pytest.skip(
+ f"Test asset '{path}' not found. "
+ "Run submodules/download-assets.py to test with submodule assets"
+ )
+
+
+def _env_no_skip():
+ env_no_skip = os.environ.get("SEDONADB_PYTHON_NO_SKIP_TESTS", "false")
+ return env_no_skip in ("true", "1")
+
+
class DBEngine:
"""Engine-agnostic catalog and SQL engine
@@ -59,12 +83,14 @@ class DBEngine:
This is the constructor that should be used in tests to ensure that integration
style tests don't cause failure for contributors working on Python-only behaviour.
+
+ If SEDONADB_PYTHON_NO_SKIP_TESTS is set, this function will never skip to
+ avoid accidentally skipping tests on CI.
"""
import pytest
# Ensure we can force this to succeed (or fail in CI)
- env_no_skip = os.environ.get("SEDONADB_PYTHON_ENSURE_ALL_ENGINES", "false")
- if env_no_skip in ("true", "1"):
+ if _env_no_skip():
return cls(*args, **kwargs)
# By default, allow construction to fail (e.g., for contributors running
@@ -283,7 +309,8 @@ class SedonaDB(DBEngine):
return self
def execute_and_collect(self, query) -> "sedonadb.dataframe.DataFrame":
- return self.con.sql(query).collect()
+ # Use to_arrow_table() to maintain ordering of the input table
+ return self.con.sql(query).to_arrow_table()
class DuckDB(DBEngine):
diff --git a/python/sedonadb/tests/conftest.py b/python/sedonadb/tests/conftest.py
index 35b3515..9b0cc2b 100644
--- a/python/sedonadb/tests/conftest.py
+++ b/python/sedonadb/tests/conftest.py
@@ -33,3 +33,8 @@ def con():
@pytest.fixture()
def geoarrow_data():
return HERE.parent.parent.parent / "submodules" / "geoarrow-data"
+
+
+@pytest.fixture()
+def sedona_testing():
+ return HERE.parent.parent.parent / "submodules" / "sedona-testing"
diff --git a/python/sedonadb/tests/io/test_parquet.py b/python/sedonadb/tests/io/test_parquet.py
new file mode 100644
index 0000000..669334d
--- /dev/null
+++ b/python/sedonadb/tests/io/test_parquet.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import tempfile
+import shapely
+import geopandas
+from pyarrow import parquet
+from pathlib import Path
+from sedonadb.testing import geom_or_null, SedonaDB, DuckDB, skip_if_not_exists
+
+
[email protected]("name", ["water-junc", "water-point"])
+def test_read_whole_geoparquet(geoarrow_data, name):
+ # Checks a read of some non-trivial files and ensures we match a GeoPandas read
+ eng = SedonaDB()
+ path = geoarrow_data / "ns-water" / "files" / f"ns-water_{name}_geo.parquet"
+ skip_if_not_exists(path)
+
+ gdf = geopandas.read_parquet(path).sort_values(by="OBJECTID").reset_index(drop=True)
+
+ eng.create_view_parquet("tab", path)
+ result = eng.execute_and_collect("""SELECT * FROM tab ORDER BY "OBJECTID";""")
+ eng.assert_result(result, gdf)
+
+
[email protected]("name", ["geoparquet-1.0.0", "geoparquet-1.1.0", "plain"])
+def test_read_sedona_testing(sedona_testing, name):
+ # Checks a read of trivial files (some GeoParquet and some not) against a DuckDB read
+ duckdb = DuckDB.create_or_skip()
+ sedonadb = SedonaDB()
+ path = sedona_testing / "data" / "parquet" / f"{name}.parquet"
+ skip_if_not_exists(path)
+
+ duckdb.create_view_parquet("tab", path)
+ result_duckdb = duckdb.execute_and_collect("SELECT * FROM tab")
+ df_duckdb = duckdb.result_to_pandas(result_duckdb)
+
+ # DuckDB never returns CRSes
+ kwargs = {}
+ if isinstance(df_duckdb, geopandas.GeoDataFrame):
+ kwargs["check_crs"] = False
+
+ sedonadb.create_view_parquet("tab", path)
+ sedonadb.assert_query_result("SELECT * FROM tab", df_duckdb, **kwargs)
+
+
[email protected]("name", ["water-junc", "water-point"])
+def test_read_geoparquet_pruned(geoarrow_data, name):
+ # Note that this doesn't check that pruning actually occurred, just that
+ # for a query where we should be pruning automatically we don't omit results.
+ eng = SedonaDB()
+ path = geoarrow_data / "ns-water" / "files" / f"ns-water_{name}_geo.parquet"
+ skip_if_not_exists(path)
+
+ # Roughly a diamond around Gaspereau Lake, Nova Scotia, in UTM zone 20
+ wkt_filter = """
+ POLYGON ((
+ 371000 4978000, 376000 4972000, 381000 4978000,
+ 376000 4983000, 371000 4978000
+ ))
+ """
+ poly_filter = shapely.from_wkt(wkt_filter)
+
+ gdf = geopandas.read_parquet(path)
+ gdf = (
+ gdf[gdf.geometry.intersects(poly_filter)]
+ .sort_values(by="OBJECTID")
+ .reset_index(drop=True)
+ )
+ gdf = gdf[["OBJECTID", "geometry"]]
+
+ # Make sure this isn't a bogus test
+ assert len(gdf) > 0
+
+ with tempfile.TemporaryDirectory() as td:
+ # Write using GeoPandas, which implements GeoParquet 1.1 bbox covering
+ # Write tiny row groups so that many bounding boxes have to be checked
+ tmp_parquet = Path(td) / f"{name}.parquet"
+ geopandas.read_parquet(path).to_parquet(
+ tmp_parquet,
+ schema_version="1.1.0",
+ write_covering_bbox=True,
+ row_group_size=1024,
+ )
+
+ eng.create_view_parquet("tab", tmp_parquet)
+ result = eng.execute_and_collect(
+ f"""
+ SELECT "OBJECTID", geometry FROM tab
+ WHERE ST_Intersects(geometry, ST_SetSRID({geom_or_null(wkt_filter)}, '{gdf.crs.to_json()}'))
+ ORDER BY "OBJECTID";
+ """
+ )
+ eng.assert_result(result, gdf)
+
+ # Write a dataset with one file per row group to check file pruning correctness
+ ds_dir = Path(td) / "ds"
+ ds_dir.mkdir()
+ ds_paths = []
+
+ with parquet.ParquetFile(tmp_parquet) as f:
+ for i in range(f.metadata.num_row_groups):
+ tab = f.read_row_group(i, ["OBJECTID", "geometry"])
+ df = geopandas.GeoDataFrame.from_arrow(tab)
+ ds_path = ds_dir / f"file{i}.parquet"
+ df.to_parquet(ds_path)
+ ds_paths.append(ds_path)
+
+ # Check a query against the same dataset without the bbox column but with file-level
+ # geoparquet metadata bounding boxes
+ eng.create_view_parquet("tab_dataset", ds_paths)
+ result = eng.execute_and_collect(
+ f"""
+ SELECT * FROM tab_dataset
+ WHERE ST_Intersects(geometry, ST_SetSRID({geom_or_null(wkt_filter)}, '{gdf.crs.to_json()}'))
+ ORDER BY "OBJECTID";
+ """
+ )
+ eng.assert_result(result, gdf)
diff --git a/submodules/download-assets.py b/submodules/download-assets.py
index 35b824a..03c69f8 100644
--- a/submodules/download-assets.py
+++ b/submodules/download-assets.py
@@ -14,6 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+
import json
import urllib.request
import shutil
@@ -22,22 +23,46 @@ from pathlib import Path
HERE = Path(__file__).parent
-with open(HERE / "geoarrow-data" / "manifest.json") as f:
- manifest = json.load(f)
-
-for group in manifest["groups"]:
- group_name = group["name"]
- for file in group["files"]:
- url = file["url"]
- filename = Path(url).name
- local_path = HERE / "geoarrow-data" / group_name / "files" / filename
- if local_path.exists():
- print(f"Using cached '{filename}'")
- elif file["format"] in ("parquet", "geoparquet"):
- # Only download Parquet/GeoParquet versions of asset files to save space
- print(f"Downloading {url}")
- local_path.parent.mkdir(parents=True, exist_ok=True)
- with urllib.request.urlopen(url) as fin, open(local_path, "wb") as fout:
- shutil.copyfileobj(fin, fout)
-
-print("Done!")
+
+def download_files_lazy(include):
+ with open(HERE / "geoarrow-data" / "manifest.json") as f:
+ manifest = json.load(f)
+
+ for group in manifest["groups"]:
+ group_name = group["name"]
+ for file in group["files"]:
+ url = file["url"]
+ filename = Path(url).name
+ local_path = HERE / "geoarrow-data" / group_name / "files" / filename
+ if not path_match(local_path, include):
+ continue
+
+ if local_path.exists():
+ print(f"Using cached '{filename}'")
+ elif file["format"] in ("parquet", "geoparquet"):
+ # Only download Parquet/GeoParquet versions of asset files to save space
+ print(f"Downloading {url}")
+ local_path.parent.mkdir(parents=True, exist_ok=True)
+ with urllib.request.urlopen(url) as fin, open(local_path, "wb") as fout:
+ shutil.copyfileobj(fin, fout)
+
+ print("Done!")
+
+
+def path_match(path, include):
+ for pattern in include:
+ if path.match(pattern):
+ return True
+
+ return False
+
+
+if __name__ == "__main__":
+ import sys
+
+ include_patterns = sys.argv[1:]
+
+ if not include_patterns:
+ include_patterns = ["*"]
+
+ download_files_lazy(include_patterns)
diff --git a/submodules/sedona-testing b/submodules/sedona-testing
new file mode 160000
index 0000000..5073f74
--- /dev/null
+++ b/submodules/sedona-testing
@@ -0,0 +1 @@
+Subproject commit 5073f7405a6aa2b8eb326da94356802dca956a6a