This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new 8331bb7 [CI] Add pre-commit with basic checks (#77)
8331bb7 is described below
commit 8331bb746f81650f1f014bc470f309df9f3ddad5
Author: John Bampton <[email protected]>
AuthorDate: Fri Jan 23 07:35:31 2026 +1000
[CI] Add pre-commit with basic checks (#77)
Ran pre-commit; the resulting clean-ups were:
- chmod +x Python scripts in benchmark directory
- trailing whitespace cleanup in Python scripts in benchmark directory
- end of files cleaned up
---
.asf.yaml | 2 +-
.github/workflows/build-py-packages.yml | 2 +-
.github/workflows/packaging.yml | 2 +-
.github/workflows/pre-commit.yml | 53 ++++++++
.../workflows/spatialbench-cli-publish-pypi.yml | 2 +-
.github/workflows/spatialbench-publish-crates.yml | 2 +-
.gitignore | 2 +-
.pre-commit-config.yaml | 91 ++++++++++++++
CONTRIBUTING.md | 2 +-
LICENSE | 2 +-
benchmark/run_benchmark.py | 134 ++++++++++-----------
benchmark/summarize_results.py | 68 +++++------
dev/release/README.md | 2 +-
dev/release/check-rat-report.py | 1 -
dev/release/run-rat.sh | 2 -
dev/release/verify-release-candidate.sh | 1 -
docs/contributors-guide.md | 2 +-
docs/javascripts/katex.js | 2 +-
docs/queries.md | 1 -
docs/requirements.txt | 4 -
docs/stylesheets/extra.css | 2 +-
raster/generator.py | 2 +-
spatialbench-arrow/README.md | 2 +-
spatialbench-config.yml | 2 +-
spatialbench-queries/print_queries.py | 2 +-
spatialbench/data/README.md | 2 +-
26 files changed, 262 insertions(+), 127 deletions(-)
diff --git a/.asf.yaml b/.asf.yaml
index a535528..0bbdb43 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -54,4 +54,4 @@ github:
# Enable projects for project management boards
projects: false
# Enable GitHub discussion
- discussions: false
\ No newline at end of file
+ discussions: false
diff --git a/.github/workflows/build-py-packages.yml b/.github/workflows/build-py-packages.yml
index e95b201..4cb0805 100644
--- a/.github/workflows/build-py-packages.yml
+++ b/.github/workflows/build-py-packages.yml
@@ -132,4 +132,4 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
- path: spatialbench-cli/dist
\ No newline at end of file
+ path: spatialbench-cli/dist
diff --git a/.github/workflows/packaging.yml b/.github/workflows/packaging.yml
index afce8ef..eef3545 100644
--- a/.github/workflows/packaging.yml
+++ b/.github/workflows/packaging.yml
@@ -105,4 +105,4 @@ jobs:
if: success() && github.repository == 'apache/sedona-spatialbench'
run: |
cd pages-clone
- git push
\ No newline at end of file
+ git push
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000..54c0ad8
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: pre-commit
+
+on: [pull_request]
+
+permissions:
+ contents: read
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: ${{ github.ref != 'refs/heads/master' }}
+
+jobs:
+ pre-commit:
+ name: Run pre-commit # https://pre-commit.com/
+ runs-on: ubuntu-latest
+ steps:
+ - name: 'Checkout ${{ github.ref }} ( ${{ github.sha }} )'
+ uses: actions/checkout@v6
+ with:
+ persist-credentials: false
+ - uses: actions/setup-python@v6 # https://www.python.org/
+ with:
+ python-version: '3.13' # Version range or exact version of a Python version to use, using SemVer's version range syntax
+ architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified
+ - name: Install dependencies # https://pip.pypa.io/en/stable/
+ run: |
+ python -m pip install --upgrade pip
+ pip install pre-commit
+ - name: set PY
+ run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
+ - uses: actions/cache@v5
+ with:
+ path: ~/.cache/pre-commit
+ key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
+ - name: Run pre-commit
+ run: pre-commit run --color=always --all-files
diff --git a/.github/workflows/spatialbench-cli-publish-pypi.yml b/.github/workflows/spatialbench-cli-publish-pypi.yml
index 679da96..52e9ac1 100644
--- a/.github/workflows/spatialbench-cli-publish-pypi.yml
+++ b/.github/workflows/spatialbench-cli-publish-pypi.yml
@@ -45,4 +45,4 @@ jobs:
uses: PyO3/maturin-action@v1
with:
command: upload
- args: --non-interactive --skip-existing dist/*
\ No newline at end of file
+ args: --non-interactive --skip-existing dist/*
diff --git a/.github/workflows/spatialbench-publish-crates.yml b/.github/workflows/spatialbench-publish-crates.yml
index fe9221e..0ca8f77 100644
--- a/.github/workflows/spatialbench-publish-crates.yml
+++ b/.github/workflows/spatialbench-publish-crates.yml
@@ -30,4 +30,4 @@ jobs:
working-directory: ${{ matrix.package }}
run: cargo publish
env:
- CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
\ No newline at end of file
+ CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
diff --git a/.gitignore b/.gitignore
index 0041ccf..3c269b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,4 @@ __old/
Cargo.lock
.idea
.venv/
-site/
\ No newline at end of file
+site/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..02becc2
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+---
+repos:
+ - repo: meta
+ hooks:
+ - id: identity
+ name: run identity check
+ description: check you have set your git identity
+ - id: check-hooks-apply
+ name: run check hooks apply
+ description: check that all the hooks apply to the repository
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v6.0.0
+ hooks:
+ - id: check-ast
+ name: run check-ast
+ description: check Python files for syntax errors
+ - id: check-builtin-literals
+ name: run check-builtin-literals
+ description: check Python files for proper use of built-in literals
+ - id: check-case-conflict
+ name: run check-case-conflict
+ description: check for case conflicts in file names
+ - id: check-docstring-first
+ name: run check-docstring-first
+ description: check that docstrings are at the start of functions
+ - id: check-executables-have-shebangs
+ name: run check-executables-have-shebangs
+ description: check that executable scripts have shebang lines
+ - id: check-illegal-windows-names
+ name: run check-illegal-windows-names
+ description: check for Windows-illegal file names
+ - id: check-json
+ name: run check-json
+ description: check JSON files for syntax errors
+ - id: check-merge-conflict
+ name: run check-merge-conflict
+ description: check for merge conflict markers
+ - id: check-shebang-scripts-are-executable
+ name: run check-shebang-scripts-are-executable
+ description: check that scripts with shebangs are executable
+ - id: check-toml
+ name: run check-toml
+ description: check TOML files for syntax errors
+ - id: check-vcs-permalinks
+ name: run check-vcs-permalinks
+ description: ensures that links to vcs websites are permalinks
+ - id: check-xml
+ name: run check-xml
+ description: attempts to load all xml files to verify syntax
+ - id: check-yaml
+ name: run check-yaml
+ description: attempts to load all yaml files to verify syntax
+ - id: debug-statements
+ name: run debug-statements
+ description: check for debugger imports and py37+ `breakpoint()` calls in python source.
+ - id: destroyed-symlinks
+ name: run destroyed-symlinks
+ description: detects symlinks which are changed to regular files with a content of a path which that symlink was pointing to
+ - id: detect-aws-credentials
+ name: run detect-aws-credentials
+ description: checks for the existence of AWS secrets that you have set up with the AWS CLI
+ args: [--allow-missing-credentials]
+ - id: detect-private-key
+ name: run detect-private-key
+ description: checks for the existence of private keys
+ - id: end-of-file-fixer
+ name: run end-of-file-fixer
+ description: makes sure files end in a newline and only a newline
+ - id: fix-byte-order-marker
+ name: run fix-byte-order-marker
+ description: removes UTF-8 byte order marker
+ - id: forbid-submodules
+ name: run forbid-submodules
+ description: forbids any submodules in the repository
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b5b1b28..7c85c85 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,4 +19,4 @@
# CONTRIBUTING.md
-See the [contributors-guide.md](docs/contributors-guide.md)
\ No newline at end of file
+See the [contributors-guide.md](docs/contributors-guide.md)
diff --git a/LICENSE b/LICENSE
index f49a4e1..261eeb9 100644
--- a/LICENSE
+++ b/LICENSE
@@ -198,4 +198,4 @@
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
- limitations under the License.
\ No newline at end of file
+ limitations under the License.
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
old mode 100644
new mode 100755
index fca3c8e..e05e237
--- a/benchmark/run_benchmark.py
+++ b/benchmark/run_benchmark.py
@@ -65,7 +65,7 @@ class BenchmarkSuite:
total_time: float = 0.0
timestamp: str = field(default_factory=lambda:
datetime.now(timezone.utc).isoformat())
version: str = "unknown"
-
+
def to_dict(self) -> dict[str, Any]:
return {
"engine": self.engine,
@@ -99,7 +99,7 @@ def _run_query_in_process(
query_sql: str | None,
):
"""Worker function to run a query in a separate process.
-
+
This allows us to forcefully terminate queries that hang or consume
too much memory, which SIGALRM cannot do for native code.
"""
@@ -107,7 +107,7 @@ def _run_query_in_process(
# For Spatial Polars, ensure the package is imported first to register
namespace
if engine_class.__name__ == "SpatialPolarsBenchmark":
import spatial_polars as _sp # noqa: F401
-
+
benchmark = engine_class(data_paths)
benchmark.setup()
try:
@@ -133,17 +133,17 @@ def _run_query_in_process(
def get_data_paths(data_dir: str) -> dict[str, str]:
"""Get paths to all data tables.
-
+
Supports two data formats:
1. Directory format: table_name/*.parquet (e.g.,
building/building.1.parquet)
2. Single file format: table_name.parquet (e.g., building.parquet)
-
+
Returns directory paths for directories containing parquet files.
Both DuckDB, pandas, and SedonaDB can read all parquet files from a
directory.
"""
data_path = Path(data_dir)
paths = {}
-
+
for table in TABLES:
table_path = data_path / table
# Check for directory format first (from HF:
building/building.1.parquet)
@@ -163,32 +163,32 @@ def get_data_paths(data_dir: str) -> dict[str, str]:
matches = list(data_path.glob(f"{table}*.parquet"))
if matches:
paths[table] = str(matches[0])
-
+
return paths
class BaseBenchmark(ABC):
"""Base class for benchmark runners."""
-
+
def __init__(self, data_paths: dict[str, str], engine_name: str):
self.data_paths = data_paths
self.engine_name = engine_name
-
+
@abstractmethod
def setup(self) -> None:
"""Initialize the benchmark environment."""
pass
-
+
@abstractmethod
def teardown(self) -> None:
"""Cleanup the benchmark environment."""
pass
-
+
@abstractmethod
def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
"""Execute a query and return (row_count, result)."""
pass
-
+
def run_query(self, query_name: str, query: str | None = None, timeout:
int = 1200) -> BenchmarkResult:
"""Run a single query with timeout handling."""
start_time = time.perf_counter()
@@ -238,11 +238,11 @@ class BaseBenchmark(ABC):
class DuckDBBenchmark(BaseBenchmark):
"""DuckDB benchmark runner."""
-
+
def __init__(self, data_paths: dict[str, str]):
super().__init__(data_paths, "duckdb")
self._conn = None
-
+
def setup(self) -> None:
import duckdb
self._conn = duckdb.connect()
@@ -254,12 +254,12 @@ class DuckDBBenchmark(BaseBenchmark):
if Path(path).is_dir():
parquet_path = str(Path(path) / "*.parquet")
self._conn.execute(f"CREATE VIEW {table} AS SELECT * FROM
read_parquet('{parquet_path}')")
-
+
def teardown(self) -> None:
if self._conn:
self._conn.close()
self._conn = None
-
+
def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
result = self._conn.execute(query).fetchall()
return len(result), result
@@ -267,11 +267,11 @@ class DuckDBBenchmark(BaseBenchmark):
class GeoPandasBenchmark(BaseBenchmark):
"""GeoPandas benchmark runner."""
-
+
def __init__(self, data_paths: dict[str, str]):
super().__init__(data_paths, "geopandas")
self._queries = None
-
+
def setup(self) -> None:
import importlib.util
geopandas_path = Path(__file__).parent.parent / "spatialbench-queries"
/ "geopandas_queries.py"
@@ -279,10 +279,10 @@ class GeoPandasBenchmark(BaseBenchmark):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1,
QUERY_COUNT + 1)}
-
+
def teardown(self) -> None:
self._queries = None
-
+
def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
if query_name not in self._queries:
raise ValueError(f"Query {query_name} not found")
@@ -292,11 +292,11 @@ class GeoPandasBenchmark(BaseBenchmark):
class SedonaDBBenchmark(BaseBenchmark):
"""SedonaDB benchmark runner."""
-
+
def __init__(self, data_paths: dict[str, str]):
super().__init__(data_paths, "sedonadb")
self._sedona = None
-
+
def setup(self) -> None:
import sedonadb
self._sedona = sedonadb.connect()
@@ -306,10 +306,10 @@ class SedonaDBBenchmark(BaseBenchmark):
if Path(path).is_dir():
parquet_path = str(Path(path) / "*.parquet")
self._sedona.read_parquet(parquet_path).to_view(table,
overwrite=True)
-
+
def teardown(self) -> None:
self._sedona = None
-
+
def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
result = self._sedona.sql(query).to_pandas()
return len(result), result
@@ -317,15 +317,15 @@ class SedonaDBBenchmark(BaseBenchmark):
class SpatialPolarsBenchmark(BaseBenchmark):
"""Spatial Polars benchmark runner."""
-
+
def __init__(self, data_paths: dict[str, str]):
super().__init__(data_paths, "spatial_polars")
self._queries = None
-
+
def setup(self) -> None:
# spatial_polars package is already imported in _run_query_in_process
# to register .spatial namespace before any module loading
-
+
# Load query functions directly from the module
import importlib.util
query_file = Path(__file__).parent.parent / "spatialbench-queries" /
"spatial_polars.py"
@@ -333,10 +333,10 @@ class SpatialPolarsBenchmark(BaseBenchmark):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1,
QUERY_COUNT + 1)}
-
+
def teardown(self) -> None:
self._queries = None
-
+
def execute_query(self, query_name: str, query: str | None) -> tuple[int,
Any]:
if query_name not in self._queries:
raise ValueError(f"Query {query_name} not found")
@@ -347,7 +347,7 @@ class SpatialPolarsBenchmark(BaseBenchmark):
def get_sql_queries(dialect: str) -> dict[str, str]:
"""Get SQL queries for a specific dialect from print_queries.py."""
from print_queries import DuckDBSpatialBenchBenchmark,
SedonaDBSpatialBenchBenchmark
-
+
dialects = {
"duckdb": DuckDBSpatialBenchBenchmark,
"sedonadb": SedonaDBSpatialBenchBenchmark,
@@ -364,7 +364,7 @@ def run_query_isolated(
timeout: int,
) -> BenchmarkResult:
"""Run a single query in an isolated subprocess with hard timeout.
-
+
This is more robust than SIGALRM because:
1. Native code (C++/Rust) can be forcefully terminated
2. Memory-hungry queries don't affect the main process
@@ -375,20 +375,20 @@ def run_query_isolated(
target=_run_query_in_process,
args=(result_queue, engine_class, data_paths, query_name, query_sql),
)
-
+
process.start()
process.join(timeout=timeout)
-
+
if process.is_alive():
# Query exceeded timeout - forcefully terminate
process.terminate()
process.join(timeout=5) # Give it 5 seconds to terminate gracefully
-
+
if process.is_alive():
# Still alive - kill it
process.kill()
process.join(timeout=2)
-
+
return BenchmarkResult(
query=query_name,
engine=engine_name,
@@ -397,7 +397,7 @@ def run_query_isolated(
status="timeout",
error_message=f"Query {query_name} timed out after {timeout} seconds (process killed)",
)
-
+
# Process completed - get result from queue
try:
result_data = result_queue.get_nowait()
@@ -430,18 +430,18 @@ def run_benchmark(
runs: int = 3,
) -> BenchmarkSuite:
"""Generic benchmark runner for any engine.
-
+
Each query runs in an isolated subprocess to ensure:
- Hard timeout enforcement (process can be killed)
- Memory isolation (one query can't OOM the runner)
- Crash isolation (one query crash doesn't affect others)
-
+
If runs > 1 and the first run succeeds, additional runs are performed
and the average time is reported for fair comparison.
"""
-
+
from importlib.metadata import version as pkg_version
-
+
# Engine configurations
configs = {
"duckdb": {
@@ -465,30 +465,30 @@ def run_benchmark(
"queries_getter": lambda: {f"q{i}": None for i in range(1,
QUERY_COUNT + 1)},
},
}
-
+
config = configs[engine]
version = config["version_getter"]()
-
+
# Format engine name for display
display_name = engine.replace("_", " ").title()
-
+
print(f"\n{'=' * 60}")
print(f"Running {display_name} Benchmark")
print(f"{'=' * 60}")
print(f"{display_name} version: {version}")
if runs > 1:
print(f"Runs per query: {runs} (average will be reported)")
-
+
suite = BenchmarkSuite(engine=engine, scale_factor=scale_factor,
version=version)
all_queries = config["queries_getter"]()
engine_class = config["class"]
-
+
for query_name, query_sql in all_queries.items():
if queries and query_name not in queries:
continue
-
+
print(f" Running {query_name}...", end=" ", flush=True)
-
+
# First run
result = run_query_isolated(
engine_class=engine_class,
@@ -498,11 +498,11 @@ def run_benchmark(
query_sql=query_sql,
timeout=timeout,
)
-
+
# If first run succeeded and we want multiple runs, do additional runs
if result.status == "success" and runs > 1:
run_times = [result.time_seconds]
-
+
for run_num in range(2, runs + 1):
additional_result = run_query_isolated(
engine_class=engine_class,
@@ -517,7 +517,7 @@ def run_benchmark(
else:
# If any subsequent run fails, just use successful runs
break
-
+
# Calculate average of all successful runs
avg_time = round(sum(run_times) / len(run_times), 2)
result = BenchmarkResult(
@@ -533,11 +533,11 @@ def run_benchmark(
print(f"{result.time_seconds}s ({result.row_count} rows)")
else:
print(f"{result.status.upper()}: {result.error_message}")
-
+
suite.results.append(result)
if result.status == "success":
suite.total_time += result.time_seconds
-
+
return suite
@@ -546,12 +546,12 @@ def print_summary(results: list[BenchmarkSuite]) -> None:
print(f"\n{'=' * 80}")
print("BENCHMARK SUMMARY")
print("=" * 80)
-
+
all_queries = sorted(
{r.query for suite in results for r in suite.results},
key=lambda x: int(x[1:])
)
-
+
data = {
suite.engine: {
r.query: f"{r.time_seconds:.2f}s" if r.status == "success" else
r.status.upper()
@@ -559,16 +559,16 @@ def print_summary(results: list[BenchmarkSuite]) -> None:
}
for suite in results
}
-
+
engines = [s.engine for s in results]
header = f"{'Query':<10}" + "".join(f"{e:<15}" for e in engines)
print(header)
print("-" * len(header))
-
+
for query in all_queries:
row = f"{query:<10}" + "".join(f"{data.get(e, {}).get(query, 'N/A'):<15}" for e in engines)
print(row)
-
+
print("-" * len(header))
print(f"{'Total':<10}" + "".join(f"{s.total_time:.2f}s{'':<9}" for s in
results))
@@ -581,10 +581,10 @@ def save_results(results: list[BenchmarkSuite], output_file: str) -> None:
"generated_at": datetime.now(timezone.utc).isoformat(),
"results": [suite.to_dict() for suite in results],
}
-
+
with open(output_file, "w") as f:
json.dump(output, f, indent=2)
-
+
print(f"\nResults saved to {output_file}")
@@ -606,33 +606,33 @@ def main():
help="Output file for results")
parser.add_argument("--scale-factor", type=float, default=1,
help="Scale factor of the data (for reporting only)")
-
+
args = parser.parse_args()
-
+
engines = [e.strip().lower() for e in args.engines.split(",")]
valid_engines = {"duckdb", "geopandas", "sedonadb", "spatial_polars"}
-
+
for e in engines:
if e not in valid_engines:
print(f"Error: Unknown engine '{e}'. Valid options: {valid_engines}")
sys.exit(1)
-
+
queries = [q.strip().lower() for q in args.queries.split(",")] if
args.queries else None
-
+
data_paths = get_data_paths(args.data_dir)
if not data_paths:
print(f"Error: No data files found in {args.data_dir}")
sys.exit(1)
-
+
print("Data paths:")
for table, path in data_paths.items():
print(f" {table}: {path}")
-
+
results = [
run_benchmark(engine, data_paths, queries, args.timeout,
args.scale_factor, args.runs)
for engine in engines
]
-
+
print_summary(results)
save_results(results, args.output)
diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py
old mode 100644
new mode 100755
index 5c08707..a52fcc8
--- a/benchmark/summarize_results.py
+++ b/benchmark/summarize_results.py
@@ -30,14 +30,14 @@ def load_results(results_dir: str) -> dict:
"""Load all JSON result files from a directory."""
results = {}
results_path = Path(results_dir)
-
+
for json_file in results_path.glob("*_results.json"):
with open(json_file) as f:
data = json.load(f)
for suite in data.get("results", []):
engine = suite["engine"]
results[engine] = suite
-
+
return results
@@ -57,7 +57,7 @@ def get_winner(query: str, data: dict, engines: list) -> str | None:
result = data.get(engine, {}).get(query, {})
if result.get("status") == "success" and result.get("time_seconds") is
not None:
times[engine] = result["time_seconds"]
-
+
if not times:
return None
return min(times, key=times.get)
@@ -66,34 +66,34 @@ def get_winner(query: str, data: dict, engines: list) -> str | None:
def generate_markdown_summary(results: dict, output_file: str, query_timeout:
int | None = None, runs: int | None = None) -> str:
"""Generate a markdown summary of benchmark results for GitHub Actions."""
engines = sorted(results.keys())
-
+
if not engines:
markdown = "# š SpatialBench Benchmark Results\n\nā ļø No results found."
with open(output_file, "w") as f:
f.write(markdown)
return markdown
-
+
# Get scale factor from first result
scale_factor = results[engines[0]].get("scale_factor", 1)
timestamp = results[engines[0]].get("timestamp",
datetime.now(timezone.utc).isoformat())
-
+
# Collect all queries
all_queries = set()
for engine_data in results.values():
for r in engine_data.get("results", []):
all_queries.add(r["query"])
all_queries = sorted(all_queries, key=lambda x: int(x[1:]))
-
+
# Build result lookup
data = {}
for engine, engine_data in results.items():
data[engine] = {}
for r in engine_data.get("results", []):
data[engine][r["query"]] = r
-
+
# Get version info
versions = {engine: results[engine].get("version", "unknown") for engine
in engines}
-
+
# Engine display names with icons
engine_icons = {
"sedonadb": "šµ SedonaDB",
@@ -101,7 +101,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"geopandas": "🐼 GeoPandas",
"spatial_polars": "🐻‍❄️ Spatial Polars",
}
-
+
# Generate markdown
lines = [
"# š SpatialBench Benchmark Results",
@@ -119,11 +119,11 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"| Engine | Version |",
"|--------|---------|",
]
-
+
for engine in engines:
icon_name = engine_icons.get(engine, engine.title())
lines.append(f"| {icon_name} | `{versions[engine]}` |")
-
+
# Main results table
lines.extend([
"",
@@ -132,7 +132,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"| Query | " + " | ".join(engine_icons.get(e, e.title()) for e in
engines) + " |",
"|:------|" + "|".join(":---:" for _ in engines) + "|",
])
-
+
# Add rows for each query with winner highlighting
for query in all_queries:
winner = get_winner(query, data, engines)
@@ -154,14 +154,14 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
else:
row += " ā |"
lines.append(row)
-
+
# Win count summary
win_counts = {engine: 0 for engine in engines}
for query in all_queries:
winner = get_winner(query, data, engines)
if winner:
win_counts[winner] += 1
-
+
lines.extend([
"",
"## š„ Performance Summary",
@@ -169,19 +169,19 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"| Engine | Wins |",
"|--------|:----:|",
])
-
+
for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True):
icon_name = engine_icons.get(engine, engine.title())
wins = win_counts[engine]
lines.append(f"| {icon_name} | {wins} |")
-
+
# Detailed results section (collapsible)
lines.extend([
"",
"## š Detailed Results",
"",
])
-
+
for engine in engines:
icon_name = engine_icons.get(engine, engine.title())
lines.extend([
@@ -191,32 +191,32 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"| Query | Time | Status | Rows |",
"|:------|-----:|:------:|-----:|",
])
-
+
for query in all_queries:
result = data.get(engine, {}).get(query, {})
time_str = format_time(result.get("time_seconds"))
status = result.get("status", "N/A")
rows = result.get("row_count")
row_str = f"{rows:,}" if rows is not None else "ā"
-
+
status_emoji = {
"success": "✅",
"error": "ā",
"timeout": "⏱️",
}.get(status, "ā")
-
+
lines.append(f"| {query.upper()} | {time_str} | {status_emoji} |
{row_str} |")
-
+
lines.extend([
"",
"</details>",
"",
])
-
+
# Add error details if any
has_errors = False
error_lines = ["## ⚠️ Errors and Timeouts", ""]
-
+
for engine in engines:
engine_errors = []
for query in all_queries:
@@ -227,7 +227,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
if len(error_msg) > 200:
error_msg = error_msg[:200] + "..."
engine_errors.append(f"- **{query.upper()}**: `{error_msg}`")
-
+
if engine_errors:
has_errors = True
icon_name = engine_icons.get(engine, engine.title())
@@ -235,10 +235,10 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
error_lines.append("")
error_lines.extend(engine_errors)
error_lines.append("")
-
+
if has_errors:
lines.extend(error_lines)
-
+
# Footer
lines.extend([
"---",
@@ -251,13 +251,13 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
"",
f"*Generated by [SpatialBench](https://github.com/apache/sedona-spatialbench) on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*",
])
-
+
markdown = "\n".join(lines)
-
+
# Write to file
with open(output_file, "w") as f:
f.write(markdown)
-
+
return markdown
@@ -289,18 +289,18 @@ def main():
default=3,
help="Number of runs per query (for reporting)",
)
-
+
args = parser.parse_args()
-
+
results = load_results(args.results_dir)
-
+
if not results:
print(f"No results found in {args.results_dir}")
# Write empty summary
with open(args.output, "w") as f:
f.write("# SpatialBench Benchmark Results\n\nNo results found.")
return
-
+
markdown = generate_markdown_summary(results, args.output, args.timeout,
args.runs)
print(f"Summary written to {args.output}")
print("\nPreview:")
diff --git a/dev/release/README.md b/dev/release/README.md
index 01db51e..7e3b58e 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -202,4 +202,4 @@ To publish, run this command:
```shell
cargo publish
-```
\ No newline at end of file
+```
diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py
index b9fd85a..88c900c 100644
--- a/dev/release/check-rat-report.py
+++ b/dev/release/check-rat-report.py
@@ -56,4 +56,3 @@ if not all_ok:
print("OK")
sys.exit(0)
-
diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh
index d5230c7..4f7d791 100755
--- a/dev/release/run-rat.sh
+++ b/dev/release/run-rat.sh
@@ -42,5 +42,3 @@ else
echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt"
exit 1
fi
-
-
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index 4589891..1c49380 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -279,4 +279,3 @@ TEST_SUCCESS=yes
echo "Release candidate ${VERSION}-RC${RC_NUMBER} looks good!"
exit 0
-
diff --git a/docs/contributors-guide.md b/docs/contributors-guide.md
index 5f33ccf..df8d609 100644
--- a/docs/contributors-guide.md
+++ b/docs/contributors-guide.md
@@ -175,4 +175,4 @@ To contribute to the SpatialBench documentation:
* `mkdocs serve` - Start the live-reloading docs server.
* `mkdocs build` - Build the documentation site.
* `mkdocs -h` - Print help message and exit.
-5. Push your changes and open a pull request.
\ No newline at end of file
+5. Push your changes and open a pull request.
diff --git a/docs/javascripts/katex.js b/docs/javascripts/katex.js
index 5abd19e..ec73954 100644
--- a/docs/javascripts/katex.js
+++ b/docs/javascripts/katex.js
@@ -18,4 +18,4 @@
if (window.document$) {
document$.subscribe(renderAll);
}
-})();
\ No newline at end of file
+})();
diff --git a/docs/queries.md b/docs/queries.md
index 7cc4741..5edfd4a 100644
--- a/docs/queries.md
+++ b/docs/queries.md
@@ -609,4 +609,3 @@ ORDER BY t.t_tripkey ASC, distance_to_building ASC, b.b_buildingkey ASC
āāāāāāāāāāāāā¼āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā¼āāāāāāāāāāāāāāāā¼āāāāāāāāāāāāāāāā¼āāāāāāāāāāāāāāāāāāāāā¤
ā 1 ā 01010000009f3c318dd43735405930⦠ā 8384 ā lavender
ā 1.4195012994942622 ā
āāāāāāāāāāāāā“āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā“āāāāāāāāāāāāāāāā“āāāāāāāāāāāāāāāā“āāāāāāāāāāāāāāāāāāāāā
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
index e90aba0..01da27b 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -12,7 +12,3 @@ notebook
nbconvert
pyproj
ruff
-
-
-
-
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
index 644bdc4..32a30f6 100644
--- a/docs/stylesheets/extra.css
+++ b/docs/stylesheets/extra.css
@@ -105,4 +105,4 @@
/* Change the mobile nav header label text to white */
label.md-nav__title {
color: #FFFFFF !important;
-}
\ No newline at end of file
+}
diff --git a/raster/generator.py b/raster/generator.py
index 0e5d70b..5c11ab2 100644
--- a/raster/generator.py
+++ b/raster/generator.py
@@ -85,4 +85,4 @@ def create_multiband_landsat_like_cog(
print(f"Saved: {filename}")
# Example usage:
-create_multiband_landsat_like_cog("output/synthetic_landsat_multiband.tif")
\ No newline at end of file
+create_multiband_landsat_like_cog("output/synthetic_landsat_multiband.tif")
diff --git a/spatialbench-arrow/README.md b/spatialbench-arrow/README.md
index 95147d8..0d902de 100644
--- a/spatialbench-arrow/README.md
+++ b/spatialbench-arrow/README.md
@@ -39,4 +39,4 @@ This crate ensures correct results using two methods.
Please see [CONTRIBUTING.md] for more information on how to contribute to this
project.
-[CONTRIBUTING.md]: https://github.com/apache/sedona-spatialbench/blob/main/CONTRIBUTING.md
\ No newline at end of file
+[CONTRIBUTING.md]: https://github.com/apache/sedona-spatialbench/blob/main/CONTRIBUTING.md
diff --git a/spatialbench-config.yml b/spatialbench-config.yml
index 86ce5a8..bc93bae 100644
--- a/spatialbench-config.yml
+++ b/spatialbench-config.yml
@@ -42,4 +42,4 @@ building:
pareto_alpha_city: 1.20
pareto_xm_city: 1.0
pareto_alpha_sub: 1.00
- pareto_xm_sub: 1.0
\ No newline at end of file
+ pareto_xm_sub: 1.0
diff --git a/spatialbench-queries/print_queries.py b/spatialbench-queries/print_queries.py
index 6d4777f..90573f6 100755
--- a/spatialbench-queries/print_queries.py
+++ b/spatialbench-queries/print_queries.py
@@ -451,4 +451,4 @@ def main():
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/spatialbench/data/README.md b/spatialbench/data/README.md
index 10306f1..7bd938f 100644
--- a/spatialbench/data/README.md
+++ b/spatialbench/data/README.md
@@ -132,4 +132,4 @@ And then compare with `diff`
```shell
diff -du /tmp/customer.c.tbl /tmp/customer.java.tbl
-```
\ No newline at end of file
+```