This is an automated email from the ASF dual-hosted git repository.
kontinuation pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new e6cd9c3 Run queries in python benchmarks using only one thread (#24)
e6cd9c3 is described below
commit e6cd9c391b2ed6a50464abb17a972f8d4ce8ed9d
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Thu Oct 9 14:03:47 2025 +0800
Run queries in python benchmarks using only one thread (#24)
This PR adds single-threaded variants for engines for running unit-test
style UDF tests using only one thread. This makes the performance benchmark
results of UDF functions comparable across different engines.
---
benchmarks/README.md | 31 ++++++++++++++++++++
benchmarks/test_bench_base.py | 23 ++++++++++++++-
benchmarks/test_distance.py | 10 +++++--
benchmarks/test_functions.py | 46 +++++++++++++++++++++++-------
benchmarks/test_overlay.py | 10 +++++--
benchmarks/test_predicates.py | 14 +++++++--
python/sedonadb/python/sedonadb/testing.py | 26 +++++++++++++++++
7 files changed, 141 insertions(+), 19 deletions(-)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index c84b0ab..c0e094e 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -79,3 +79,34 @@ test_st_buffer[collections_simple-PostGIS] 855.3329
(9.96) 854.7194 (9.
```
For more details and command line options, refer to the official
[pytest-benchmark
documentation](https://pytest-benchmark.readthedocs.io/en/latest/usage.html)
+
+### Adding New Benchmarks
+
+There are two types of engines, each type serving a different purpose:
+
+- `SedonaDBSingleThread`, `DuckDBSingleThread`, `PostGISSingleThread`:
+ Micro / UDF benchmarks that measure the per-function cost (e.g. ST_Area, ST_Contains). These should run engines in a comparable, single-thread style configuration (where possible) to make function-level performance differences clearer.
+- `SedonaDB`, `DuckDB`, `PostGIS`:
+ Macro / complex query benchmarks (e.g. KNN joins) that represent perceived end-user performance. Engines run with their default / natural configuration (multi-threading, internal parallelism, etc.).
+
+Please choose the appropriate engines when adding a new benchmark. All existing benchmarks have been annotated accordingly.
+
+Example (UDF micro benchmark in single-thread mode):
+```python
+import pytest
+from sedonadb.testing import SedonaDBSingleThread, DuckDBSingleThread, PostGISSingleThread
+
+@pytest.mark.parametrize("eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread])
+def test_st_area(benchmark, eng):
+ ...
+```
+
+Example (Query / macro benchmark in default mode):
+```python
+import pytest
+from sedonadb.testing import SedonaDB, DuckDB, PostGIS
+
+@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+def test_knn_performance(benchmark, eng):
+ ...
+```
diff --git a/benchmarks/test_bench_base.py b/benchmarks/test_bench_base.py
index 8f01274..b1da22b 100644
--- a/benchmarks/test_bench_base.py
+++ b/benchmarks/test_bench_base.py
@@ -15,7 +15,14 @@
# specific language governing permissions and limitations
# under the License.
import json
-from sedonadb.testing import DuckDB, PostGIS, SedonaDB
+from sedonadb.testing import (
+ DuckDB,
+ PostGIS,
+ SedonaDB,
+ DuckDBSingleThread,
+ PostGISSingleThread,
+ SedonaDBSingleThread,
+)
class TestBenchBase:
@@ -23,6 +30,10 @@ class TestBenchBase:
self.sedonadb = SedonaDB.create_or_skip()
self.postgis = PostGIS.create_or_skip()
self.duckdb = DuckDB.create_or_skip()
+ # Single-thread engine instances
+ self.sedonadb_single = SedonaDBSingleThread.create_or_skip()
+ self.postgis_single = PostGISSingleThread.create_or_skip()
+ self.duckdb_single = DuckDBSingleThread.create_or_skip()
num_geoms = 100_000
@@ -128,6 +139,10 @@ class TestBenchBase:
self.sedonadb.create_table_arrow(name, tab)
self.postgis.create_table_arrow(name, tab)
self.duckdb.create_table_arrow(name, tab)
+ self.sedonadb_single.create_table_arrow(name, tab)
+ self.duckdb_single.create_table_arrow(name, tab)
+ # We don't need to call self.postgis_single.create_table_arrow
+ # because it shares the same database with self.postgis
def _get_eng(self, eng):
if eng == SedonaDB:
@@ -136,5 +151,11 @@ class TestBenchBase:
return self.postgis
elif eng == DuckDB:
return self.duckdb
+ elif eng == SedonaDBSingleThread:
+ return self.sedonadb_single
+ elif eng == PostGISSingleThread:
+ return self.postgis_single
+ elif eng == DuckDBSingleThread:
+ return self.duckdb_single
else:
raise ValueError(f"Unsupported engine: {eng}")
diff --git a/benchmarks/test_distance.py b/benchmarks/test_distance.py
index 5744c01..bc6ed97 100644
--- a/benchmarks/test_distance.py
+++ b/benchmarks/test_distance.py
@@ -16,11 +16,17 @@
# under the License.
import pytest
from test_bench_base import TestBenchBase
-from sedonadb.testing import DuckDB, PostGIS, SedonaDB
+from sedonadb.testing import (
+ DuckDBSingleThread,
+ PostGISSingleThread,
+ SedonaDBSingleThread,
+)
class TestBenchPredicates(TestBenchBase):
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
diff --git a/benchmarks/test_functions.py b/benchmarks/test_functions.py
index 01d34e1..d8ec008 100644
--- a/benchmarks/test_functions.py
+++ b/benchmarks/test_functions.py
@@ -16,11 +16,17 @@
# under the License.
import pytest
from test_bench_base import TestBenchBase
-from sedonadb.testing import DuckDB, PostGIS, SedonaDB
+from sedonadb.testing import (
+ DuckDBSingleThread,
+ SedonaDBSingleThread,
+ PostGISSingleThread,
+)
class TestBenchFunctions(TestBenchBase):
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -36,7 +42,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -51,7 +59,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -67,7 +77,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -83,7 +95,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -99,7 +113,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -115,7 +131,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -131,7 +149,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -147,7 +167,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -164,7 +186,9 @@ class TestBenchFunctions(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
diff --git a/benchmarks/test_overlay.py b/benchmarks/test_overlay.py
index cc8a513..cd9121f 100644
--- a/benchmarks/test_overlay.py
+++ b/benchmarks/test_overlay.py
@@ -16,11 +16,17 @@
# under the License.
import pytest
from test_bench_base import TestBenchBase
-from sedonadb.testing import DuckDB, PostGIS, SedonaDB
+from sedonadb.testing import (
+ DuckDBSingleThread,
+ PostGISSingleThread,
+ SedonaDBSingleThread,
+)
class TestBenchPredicates(TestBenchBase):
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
diff --git a/benchmarks/test_predicates.py b/benchmarks/test_predicates.py
index cd8b871..9e3a9ec 100644
--- a/benchmarks/test_predicates.py
+++ b/benchmarks/test_predicates.py
@@ -16,11 +16,17 @@
# under the License.
import pytest
from test_bench_base import TestBenchBase
-from sedonadb.testing import DuckDB, PostGIS, SedonaDB
+from sedonadb.testing import (
+ DuckDBSingleThread,
+ PostGISSingleThread,
+ SedonaDBSingleThread,
+)
class TestBenchPredicates(TestBenchBase):
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
@@ -36,7 +42,9 @@ class TestBenchPredicates(TestBenchBase):
benchmark(queries)
- @pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
+ @pytest.mark.parametrize(
+ "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
+ )
@pytest.mark.parametrize(
"table",
[
diff --git a/python/sedonadb/python/sedonadb/testing.py b/python/sedonadb/python/sedonadb/testing.py
index df04907..83533dd 100644
--- a/python/sedonadb/python/sedonadb/testing.py
+++ b/python/sedonadb/python/sedonadb/testing.py
@@ -344,6 +344,15 @@ class SedonaDB(DBEngine):
return self.con.sql(query).to_arrow_table()
+class SedonaDBSingleThread(SedonaDB):
+ """SedonaDB configured for single-threaded execution"""
+
+ def __init__(self):
+ super().__init__()
+ # Force single-threaded execution
+ self.con.sql("SET datafusion.execution.target_partitions TO 1")
+
+
class DuckDB(DBEngine):
"""A DuckDB implementation of the DBEngine using DuckDB Python"""
@@ -395,6 +404,14 @@ class DuckDB(DBEngine):
return self.con.sql(query).fetch_arrow_table()
+class DuckDBSingleThread(DuckDB):
+ """DuckDB configured for single-threaded execution"""
+
+ def __init__(self):
+ super().__init__()
+ self.con.sql("SET threads TO 1")
+
+
class PostGIS(DBEngine):
"""A PostGIS implementation of the DBEngine using ADBC
@@ -598,6 +615,15 @@ class PostGIS(DBEngine):
return col_srid
+class PostGISSingleThread(PostGIS):
+ """PostGIS configured for single-threaded (no parallel workers) execution"""
+
+ def __init__(self, uri=None):
+ super().__init__(uri)
+ with self.con.cursor() as cur:
+ cur.execute("SET max_parallel_workers_per_gather TO 0")
+
+
def geom_or_null(arg):
"""Format SQL expression for a geometry object or NULL"""
if arg is None: