This is an automated email from the ASF dual-hosted git repository.
prantogg pushed a commit to branch support-multipart
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/support-multipart by this push:
new e4ee328 update generate_data.py script
e4ee328 is described below
commit e4ee328be935cec158fff809bfd385ed6495c5d1
Author: Pranav Toggi <[email protected]>
AuthorDate: Sun Oct 26 00:42:47 2025 -0700
update generate_data.py script
---
README.md | 13 ++-----------
tools/generate_data.py | 30 +++---------------------------
2 files changed, 5 insertions(+), 38 deletions(-)
diff --git a/README.md b/README.md
index 3ee105e..2ac7140 100644
--- a/README.md
+++ b/README.md
@@ -95,13 +95,7 @@ Key performance benefits:
SpatialBench is a Rust-based fork of the tpchgen-rs project. It preserves the
original’s high-performance, multi-threaded, streaming architecture, while
extending it with a spatial star schema and geometry generation logic.
-You can build the SpatialBench data generator using Cargo:
-
-```bash
-cargo build --release
-```
-
-Alternatively, install it directly using:
+You can install the SpatialBench data generator using Cargo:
```bash
cargo install --path ./spatialbench-cli
@@ -133,10 +127,7 @@ spatialbench-cli -s 1 --format=parquet --tables trip,building --output-dir sf1-p
#### Partitioned Output Example
```bash
-for PART in $(seq 1 4); do
- mkdir part-$PART
- spatialbench-cli -s 10 --tables trip,building --output-dir part-$PART --parts 4 --part $PART
-done
+spatialbench-cli -s 10 --tables trip,building --parts 4
```
#### Generate Multiple Parquet Files of Similar Size
diff --git a/tools/generate_data.py b/tools/generate_data.py
index 34efb0b..2308603 100755
--- a/tools/generate_data.py
+++ b/tools/generate_data.py
@@ -103,12 +103,8 @@ def _generate_data(scale_factor: int, num_partitions: dict[str, int], output_pat
tables = list(num_partitions.keys())
# Ensure base directories exist
Path(output_path).mkdir(parents=True, exist_ok=True)
- (Path(output_path) / "staging").mkdir(parents=True, exist_ok=True)
- def run_one(table: str, part: int) -> None:
- # Use a per-table, per-part staging dir to avoid collisions when parallel
- staging_dir = Path(output_path) / "staging" / table / f"part-{part}"
- staging_dir.mkdir(parents=True, exist_ok=True)
+ def run_one(table: str) -> None:
result = subprocess.run(
[
@@ -118,8 +114,7 @@ def _generate_data(scale_factor: int, num_partitions: dict[str, int], output_pat
f"--format=parquet",
f"--parts={num_partitions[table]}",
f"--tables={table}",
- f"--part={part}",
- f"--output-dir={staging_dir}",
+ f"--output-dir={output_path}",
],
capture_output=True,
text=True,
@@ -129,32 +124,13 @@ def _generate_data(scale_factor: int, num_partitions: dict[str, int], output_pat
logging.warning("Command errors:")
logging.warning(result.stderr)
- # Collate results by table instead of part
- dest_dir = Path(output_path) / table
- dest_dir.mkdir(parents=True, exist_ok=True)
- src_file = staging_dir / f"{table}.parquet"
- dest_file = dest_dir / f"part-{part}.parquet"
- shutil.move(str(src_file), str(dest_file))
-
- # Cleanup staging for this (table, part)
- try:
- shutil.rmtree(staging_dir)
- # remove parent if empty
- parent = staging_dir.parent
- if parent.exists() and not any(parent.iterdir()):
- parent.rmdir()
- except Exception as cleanup_err:
- logging.debug(f"Cleanup warning for {staging_dir}: {cleanup_err}")
-
# Launch all generation tasks in parallel threads
futures = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=os.cpu_count() or 4
) as executor:
for table in tables:
- for idx in range(num_partitions[table]):
- part = idx + 1 # 1-indexed
- futures.append(executor.submit(run_one, table, part))
+ futures.append(executor.submit(run_one, table))
# Raise the first exception if any
for fut in concurrent.futures.as_completed(futures):
fut.result()