This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new b68b814 feature: Add Spatial Polars to the automated benchmark
framework (#73)
b68b814 is described below
commit b68b81404c9b4e5f5c2699c0c78a90b5dd4ba7b6
Author: Jia Yu <[email protected]>
AuthorDate: Wed Jan 14 21:15:01 2026 -0700
feature: Add Spatial Polars to the automated benchmark framework (#73)
---
.github/workflows/benchmark.yml | 75 +++++++++++++++++++++++++++++++++++++----
benchmark/run_benchmark.py | 54 +++++++++++++++++++++++++----
benchmark/summarize_results.py | 1 +
3 files changed, 117 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 8b5c81a..a68289c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -32,7 +32,7 @@ on:
engines:
description: 'Engines to benchmark (comma-separated)'
required: false
- default: 'duckdb,geopandas,sedonadb'
+ default: 'duckdb,geopandas,sedonadb,spatial_polars'
type: string
timeout:
description: 'Query timeout in seconds (default: 60, increase for full
benchmark)'
@@ -54,6 +54,11 @@ on:
required: false
default: ''
type: string
+ spatial_polars_version:
+ description: 'Spatial Polars version (e.g., 1.0.0, leave empty for
latest)'
+ required: false
+ default: ''
+ type: string
runs:
description: 'Number of runs per query (average taken for fair
comparison)'
required: false
@@ -71,13 +76,14 @@ concurrency:
env:
CARGO_TERM_COLOR: always
SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
- BENCHMARK_ENGINES: ${{ github.event.inputs.engines ||
'duckdb,geopandas,sedonadb' }}
+ BENCHMARK_ENGINES: ${{ github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars' }}
QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }}
# Package versions (empty = latest, can be overridden via workflow_dispatch)
SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }}
DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }}
GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }}
+ SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }}
# Hugging Face dataset for benchmark data
HF_DATASET: apache-sedona/spatialbench
HF_DATA_VERSION: v0.1.0
@@ -170,7 +176,7 @@ jobs:
name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }})
needs: download-data
runs-on: ubuntu-latest
- if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb',
'duckdb')
+ if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
steps:
- uses: actions/checkout@v4
@@ -220,7 +226,7 @@ jobs:
name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1'
}})
needs: download-data
runs-on: ubuntu-latest
- if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb',
'geopandas')
+ if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
steps:
- uses: actions/checkout@v4
@@ -266,7 +272,7 @@ jobs:
name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }})
needs: download-data
runs-on: ubuntu-latest
- if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb',
'sedonadb')
+ if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
steps:
- uses: actions/checkout@v4
@@ -308,10 +314,56 @@ jobs:
path: sedonadb_results.json
retention-days: 30
+ benchmark-spatial-polars:
+ name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor ||
'1' }})
+ needs: download-data
+ runs-on: ubuntu-latest
+ if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Restore benchmark data from cache
+ uses: actions/cache/restore@v4
+ with:
+ path: benchmark-data-sf${{ env.SCALE_FACTOR }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ fail-on-cache-miss: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install dependencies
+ run: |
+ if [ -n "${{ env.SPATIAL_POLARS_VERSION }}" ]; then
+ pip install "spatial-polars[knn]==${{ env.SPATIAL_POLARS_VERSION
}}" pyarrow
+ else
+ pip install "spatial-polars[knn]" pyarrow
+ fi
+          echo "Installed Spatial Polars version: $(python -c 'from
importlib.metadata import version; print(version("spatial-polars"))')"
+
+ - name: Run Spatial Polars benchmark
+ run: |
+ python benchmark/run_benchmark.py \
+ --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --engines spatial_polars \
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --runs ${{ env.BENCHMARK_RUNS }} \
+ --scale-factor ${{ env.SCALE_FACTOR }} \
+ --output spatial_polars_results.json
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
+ path: spatial_polars_results.json
+ retention-days: 30
+
summarize-results:
name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }})
- needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb]
- if: always() && (needs.benchmark-duckdb.result == 'success' ||
needs.benchmark-geopandas.result == 'success' ||
needs.benchmark-sedonadb.result == 'success')
+ needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb,
benchmark-spatial-polars]
+ if: always() && (needs.benchmark-duckdb.result == 'success' ||
needs.benchmark-geopandas.result == 'success' ||
needs.benchmark-sedonadb.result == 'success' ||
needs.benchmark-spatial-polars.result == 'success')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -333,12 +385,21 @@ jobs:
continue-on-error: true
- name: Download SedonaDB results
+ if: needs.benchmark-sedonadb.result == 'success'
uses: actions/download-artifact@v4
with:
name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
path: results
continue-on-error: true
+ - name: Download Spatial Polars results
+ if: needs.benchmark-spatial-polars.result == 'success'
+ uses: actions/download-artifact@v4
+ with:
+ name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
+ path: results
+ continue-on-error: true
+
- name: Setup Python
uses: actions/setup-python@v5
with:
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
index 4b459f4..fca3c8e 100644
--- a/benchmark/run_benchmark.py
+++ b/benchmark/run_benchmark.py
@@ -37,7 +37,8 @@ from pathlib import Path
from typing import Any, Callable
# Add spatialbench-queries directory to path to import query modules
-sys.path.insert(0, str(Path(__file__).parent.parent / "spatialbench-queries"))
+# Use append (not insert) so installed packages like spatial_polars are found
first
+sys.path.append(str(Path(__file__).parent.parent / "spatialbench-queries"))
# Constants
QUERY_COUNT = 12
@@ -103,6 +104,10 @@ def _run_query_in_process(
too much memory, which SIGALRM cannot do for native code.
"""
try:
+ # For Spatial Polars, ensure the package is imported first to register
namespace
+ if engine_class.__name__ == "SpatialPolarsBenchmark":
+ import spatial_polars as _sp # noqa: F401
+
benchmark = engine_class(data_paths)
benchmark.setup()
try:
@@ -310,6 +315,35 @@ class SedonaDBBenchmark(BaseBenchmark):
return len(result), result
+class SpatialPolarsBenchmark(BaseBenchmark):
+ """Spatial Polars benchmark runner."""
+
+ def __init__(self, data_paths: dict[str, str]):
+ super().__init__(data_paths, "spatial_polars")
+ self._queries = None
+
+ def setup(self) -> None:
+ # spatial_polars package is already imported in _run_query_in_process
+ # to register .spatial namespace before any module loading
+
+ # Load query functions directly from the module
+ import importlib.util
+ query_file = Path(__file__).parent.parent / "spatialbench-queries" /
"spatial_polars.py"
+ spec =
importlib.util.spec_from_file_location("spatial_polars_queries", query_file)
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1,
QUERY_COUNT + 1)}
+
+ def teardown(self) -> None:
+ self._queries = None
+
+ def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
+ if query_name not in self._queries:
+ raise ValueError(f"Query {query_name} not found")
+ result = self._queries[query_name](self.data_paths)
+ return len(result), result
+
+
def get_sql_queries(dialect: str) -> dict[str, str]:
"""Get SQL queries for a specific dialect from print_queries.py."""
from print_queries import DuckDBSpatialBenchBenchmark,
SedonaDBSpatialBenchBenchmark
@@ -425,15 +459,23 @@ def run_benchmark(
"version_getter": lambda: pkg_version("sedonadb"),
"queries_getter": lambda: get_sql_queries("sedonadb"),
},
+ "spatial_polars": {
+ "class": SpatialPolarsBenchmark,
+ "version_getter": lambda: pkg_version("spatial-polars"),
+ "queries_getter": lambda: {f"q{i}": None for i in range(1,
QUERY_COUNT + 1)},
+ },
}
config = configs[engine]
version = config["version_getter"]()
+ # Format engine name for display
+ display_name = engine.replace("_", " ").title()
+
print(f"\n{'=' * 60}")
- print(f"Running {engine.title()} Benchmark")
+ print(f"Running {display_name} Benchmark")
print(f"{'=' * 60}")
- print(f"{engine.title()} version: {version}")
+ print(f"{display_name} version: {version}")
if runs > 1:
print(f"Runs per query: {runs} (average will be reported)")
@@ -548,11 +590,11 @@ def save_results(results: list[BenchmarkSuite],
output_file: str) -> None:
def main():
parser = argparse.ArgumentParser(
- description="Run SpatialBench benchmarks comparing SedonaDB, DuckDB,
and GeoPandas"
+ description="Run SpatialBench benchmarks comparing SedonaDB, DuckDB,
GeoPandas, and Spatial Polars"
)
parser.add_argument("--data-dir", type=str, required=True,
help="Path to directory containing benchmark data
(parquet files)")
- parser.add_argument("--engines", type=str, default="duckdb,geopandas",
+ parser.add_argument("--engines", type=str,
default="duckdb,geopandas,sedonadb,spatial_polars",
help="Comma-separated list of engines to benchmark")
parser.add_argument("--queries", type=str, default=None,
help="Comma-separated list of queries to run (e.g.,
q1,q2,q3)")
@@ -568,7 +610,7 @@ def main():
args = parser.parse_args()
engines = [e.strip().lower() for e in args.engines.split(",")]
- valid_engines = {"duckdb", "geopandas", "sedonadb"}
+ valid_engines = {"duckdb", "geopandas", "sedonadb", "spatial_polars"}
for e in engines:
if e not in valid_engines:
diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py
index d324e96..5c08707 100644
--- a/benchmark/summarize_results.py
+++ b/benchmark/summarize_results.py
@@ -99,6 +99,7 @@ def generate_markdown_summary(results: dict, output_file:
str, query_timeout: in
"sedonadb": "🌵 SedonaDB",
"duckdb": "🦆 DuckDB",
"geopandas": "🐼 GeoPandas",
+        "spatial_polars": "🐻‍❄️ Spatial Polars",
}
# Generate markdown