This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit 5151a72771546f5bdfc343b2e36e2e77708adff5 Author: Pranav Toggi <[email protected]> AuthorDate: Mon Jul 21 17:36:45 2025 -0700 update package names --- .github/workflows/rust.yml | 32 ++++++------ ...-pypi.yml => spatialbench-cli-publish-pypi.yml} | 22 ++++---- ARCHITECTURE.md | 8 +-- Cargo.toml | 6 +-- README.md | 16 +++--- benchmarks/BENCHMARKS.md | 16 +++--- ...{parquet_tpchgen.sh => parquet_spatialbench.sh} | 6 +-- benchmarks/{tbl_tpchgen.sh => tbl_spatialbench.sh} | 6 +-- .../{tbl_tpchgen_1.sh => tbl_spatialbench_1.sh} | 6 +-- {tpchgen-arrow => spatialbench-arrow}/Cargo.toml | 6 +-- {tpchgen-arrow => spatialbench-arrow}/LICENSE | 0 {tpchgen-arrow => spatialbench-arrow}/README.md | 2 +- .../src/building.rs | 8 +-- .../src/conversions.rs | 8 +-- .../src/customer.rs | 8 +-- .../src/driver.rs | 8 +-- spatialbench-arrow/src/lib.rs | 61 ++++++++++++++++++++++ {tpchgen-arrow => spatialbench-arrow}/src/trip.rs | 2 +- .../src/vehicle.rs | 8 +-- {tpchgen-arrow => spatialbench-arrow}/src/zone.rs | 8 +-- .../tests/reparse.rs | 14 ++--- {tpchgen-cli => spatialbench-cli}/Cargo.toml | 6 +-- {tpchgen-cli => spatialbench-cli}/README.md | 16 +++--- {tpchgen-cli => spatialbench-cli}/pyproject.toml | 2 +- {tpchgen-cli => spatialbench-cli}/src/csv.rs | 6 +-- {tpchgen-cli => spatialbench-cli}/src/generate.rs | 2 +- {tpchgen-cli => spatialbench-cli}/src/main.rs | 30 +++++------ {tpchgen-cli => spatialbench-cli}/src/parquet.rs | 2 +- .../src/statistics.rs | 0 {tpchgen-cli => spatialbench-cli}/src/tbl.rs | 4 +- {tpchgen => spatialbench}/Cargo.toml | 4 +- {tpchgen => spatialbench}/data/README.md | 0 {tpchgen => spatialbench}/src/csv.rs | 24 ++++----- {tpchgen => spatialbench}/src/dates.rs | 2 +- {tpchgen => spatialbench}/src/decimal.rs | 2 +- {tpchgen => spatialbench}/src/distribution.rs | 0 {tpchgen => spatialbench}/src/dists.dss | 0 {tpchgen => spatialbench}/src/generators.rs | 0 {tpchgen => spatialbench}/src/kde.rs | 0 {tpchgen => spatialbench}/src/lib.rs | 2 +- {tpchgen => 
spatialbench}/src/queries.rs | 0 {tpchgen => spatialbench}/src/random.rs | 0 {tpchgen => spatialbench}/src/spider.rs | 0 {tpchgen => spatialbench}/src/spider_presets.rs | 0 {tpchgen => spatialbench}/src/text.rs | 0 .../tests/integration_tests.rs | 0 tpchgen-arrow/src/lib.rs | 60 --------------------- tpchgen-rs-readme.md | 24 ++++----- 48 files changed, 217 insertions(+), 220 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 49ce3ce..ef35d4d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -21,41 +21,41 @@ jobs: - name: Check run: cargo check --verbose --workspace --all-targets - # Tests for tpchgen - test-tests-tpchgen: + # Tests for spatialbench + test-tests-spatialbench: runs-on: ubuntu-latest needs: lint steps: - uses: actions/checkout@v4 - - name: Tests (tpchgen) - run: cargo test -p tpchgen --tests + - name: Tests (spatialbench) + run: cargo test -p spatialbench --tests - # doc examples test for tpchgen (takes a while) - test-doc-tpchgen: + # doc examples test for spatialbench (takes a while) + test-doc-spatialbench: runs-on: ubuntu-latest needs: lint steps: - uses: actions/checkout@v4 - - name: Doc Tests (tpchgen) - run: cargo test -p tpchgen --doc + - name: Doc Tests (spatialbench) + run: cargo test -p spatialbench --doc - # All tests for tpchgen-arrow - test-all-tpchgen-arrow: + # All tests for spatialbench-arrow + test-all-spatialbench-arrow: runs-on: ubuntu-latest needs: lint steps: - uses: actions/checkout@v4 - - name: All Tests (tpchgen-arrow) - run: cargo test -p tpchgen-arrow + - name: All Tests (spatialbench-arrow) + run: cargo test -p spatialbench-arrow - # All tests for tpchgen-cli - test-all-tpchgen-cli: + # All tests for spatialbench-cli + test-all-spatialbench-cli: runs-on: ubuntu-latest needs: lint steps: - uses: actions/checkout@v4 - - name: All Tests (tpchgen-cli) - run: cargo test -p tpchgen-cli + - name: All Tests (spatialbench-cli) + run: cargo test -p spatialbench-cli # 
documentation build docs: diff --git a/.github/workflows/tpchgen-cli-publish-pypi.yml b/.github/workflows/spatialbench-cli-publish-pypi.yml similarity index 89% rename from .github/workflows/tpchgen-cli-publish-pypi.yml rename to .github/workflows/spatialbench-cli-publish-pypi.yml index 5aed979..c95126e 100644 --- a/.github/workflows/tpchgen-cli-publish-pypi.yml +++ b/.github/workflows/spatialbench-cli-publish-pypi.yml @@ -1,4 +1,4 @@ -name: tpchgen-cli-publish-pypi +name: spatialbench-cli-publish-pypi on: release: @@ -33,13 +33,13 @@ jobs: with: target: ${{ matrix.platform.target }} args: --release --out dist - working-directory: tpchgen-cli + working-directory: spatialbench-cli manylinux: auto - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels-linux-${{ matrix.platform.target }} - path: tpchgen-cli/dist + path: spatialbench-cli/dist musllinux: runs-on: ${{ matrix.platform.runner }} @@ -61,13 +61,13 @@ jobs: with: target: ${{ matrix.platform.target }} args: --release --out dist - working-directory: tpchgen-cli + working-directory: spatialbench-cli manylinux: musllinux_1_2 - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels-musllinux-${{ matrix.platform.target }} - path: tpchgen-cli/dist + path: spatialbench-cli/dist windows: runs-on: ${{ matrix.platform.runner }} @@ -85,12 +85,12 @@ jobs: with: target: ${{ matrix.platform.target }} args: --release --out dist - working-directory: tpchgen-cli + working-directory: spatialbench-cli - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels-windows-${{ matrix.platform.target }} - path: tpchgen-cli/dist + path: spatialbench-cli/dist macos: runs-on: ${{ matrix.platform.runner }} @@ -108,12 +108,12 @@ jobs: with: target: ${{ matrix.platform.target }} args: --release --out dist - working-directory: tpchgen-cli + working-directory: spatialbench-cli - name: Upload wheels uses: actions/upload-artifact@v4 with: name: wheels-macos-${{ matrix.platform.target }} - path: 
tpchgen-cli/dist + path: spatialbench-cli/dist sdist: runs-on: ubuntu-latest @@ -124,12 +124,12 @@ jobs: with: command: sdist args: --out dist - working-directory: tpchgen-cli + working-directory: spatialbench-cli - name: Upload sdist uses: actions/upload-artifact@v4 with: name: wheels-sdist - path: tpchgen-cli/dist + path: spatialbench-cli/dist release: name: Release diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index f223de3..50c35e3 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -3,16 +3,16 @@ ## Crate Organization The project is organized into two crates: -1. `tpchgen`: The core library that implements the data generation logic for TPCH. -2. `tpchgen-cli`: A CLI tool that uses the `tpchgen` library to generate TPCH data. +1. `spatialbench`: The core library that implements the data generation logic for SpatialBench. +2. `spatialbench-cli`: A CLI tool that uses the `spatialbench` library to generate SpatialBench data. ## Dependencies -The `tpchgen` crate is designed to be embeddable in as many locations as +The `spatialbench` crate is designed to be embeddable in as many locations as possible and thus has very minimal dependencies by design. For example, it does not depend on arrow or parquet crates or display libraries. -The `tpchgen-cli` crate is designed to include many useful features, and thus +The `spatialbench-cli` crate is designed to include many useful features, and thus has many more dependencies. 
## Speed diff --git a/Cargo.toml b/Cargo.toml index c5433a9..04238a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [workspace] -members = [ "tpchgen" , "tpchgen-arrow", "tpchgen-cli"] +members = ["spatialbench", "spatialbench-arrow", "spatialbench-cli"] resolver = "2" [workspace.package] authors = ["clflushopt", "alamb"] edition = "2021" -homepage = "https://github.com/clflushopt/tpchgen-rs" +homepage = "https://github.com/wherobots/sedona-spatialbench" license = "Apache-2.0" readme = "README.md" -repository = "https://github.com/clflushopt/tpchgen-rs" +repository = "https://github.com/wherobots/sedona-spatialbench" version = "1.1.0" [workspace.dependencies] diff --git a/README.md b/README.md index 92a4a60..e22f843 100644 --- a/README.md +++ b/README.md @@ -55,14 +55,14 @@ cargo build --release Alternatively, install it directly using: ```bash -cargo install --path ./tpchgen-cli +cargo install --path ./spatialbench-cli ``` ### Notes -- The core generator logic lives in the tpchgen crate. -- Geometry-aware logic is in tpchgen-arrow and integrated via Arrow-based schemas. -- The spatial extension modules like the Spider geometry generator reside in [spider.rs](https://github.com/wherobots/sedona-spatialbench/blob/main/tpchgen/src/spider.rs). +- The core generator logic lives in the spatialbench crate. +- Geometry-aware logic is in spatialbench-arrow and integrated via Arrow-based schemas. +- The spatial extension modules like the Spider geometry generator reside in [spider.rs](https://github.com/wherobots/sedona-spatialbench/blob/main/spatialbench/src/spider.rs). - The generator supports output formats like .tbl and Apache Parquet via the Arrow writer. For contribution or debugging, refer to the [ARCHITECTURE.md](https://github.com/wherobots/sedona-spatialbench/blob/main/ARCHITECTURE.md) guide. 
@@ -72,13 +72,13 @@ For contribution or debugging, refer to the [ARCHITECTURE.md](https://github.com #### Generate All Tables (Scale Factor 1) ```bash -tpchgen-cli -s 1 --format=parquet +spatialbench-cli -s 1 --format=parquet ``` #### Generate Individual Tables ```bash -tpchgen-cli -s 1 --format=parquet --tables trip,building --output-dir sf1-parquet +spatialbench-cli -s 1 --format=parquet --tables trip,building --output-dir sf1-parquet ``` #### Partitioned Output Example @@ -86,13 +86,13 @@ tpchgen-cli -s 1 --format=parquet --tables trip,building --output-dir sf1-parque ```bash for PART in $(seq 1 4); do mkdir part-$PART - tpchgen-cli -s 10 --tables trip,building --output-dir part-$PART --parts 4 --part $PART + spatialbench-cli -s 10 --tables trip,building --output-dir part-$PART --parts 4 --part $PART done ``` ## SpatialBench Spider Data Generator -SpatialBench includes a synthetic spatial data generator ([spider.rs](https://github.com/wherobots/sedona-spatialbench/blob/main/tpchgen/src/spider.rs)) for creating: +SpatialBench includes a synthetic spatial data generator ([spider.rs](https://github.com/wherobots/sedona-spatialbench/blob/main/spatialbench/src/spider.rs)) for creating: - Points - Rectangles (boxes) - Polygons diff --git a/benchmarks/BENCHMARKS.md b/benchmarks/BENCHMARKS.md index 5a6f922..b86da84 100644 --- a/benchmarks/BENCHMARKS.md +++ b/benchmarks/BENCHMARKS.md @@ -13,11 +13,11 @@ limited by local disk I/O. For example: ```shell # Generate SF=100, about 100GB of data, piped to /dev/null, reporting statistics -tpchgen-cli -s 100 --stdout | pv -arb > /dev/null +spatialbench-cli -s 100 --stdout | pv -arb > /dev/null # Outputs something similar to # 106GiB [3.09GiB/s] (3.09GiB/s) # For parquet -tpchgen-cli -s 100 --format=parquet --stdout | pv -arb > /dev/null +spatialbench-cli -s 100 --format=parquet --stdout | pv -arb > /dev/null # 38.2GiB [ 865MiB/s] ( 865MiB/s) ``` @@ -49,7 +49,7 @@ single parquet file per table, with snappy page compression. 
Example command to create Scale Factor 10 ```shell -tpchgen-cli -s 10 --format=parquet +spatialbench-cli -s 10 --format=parquet ``` ## `parquet_duckdb.sh` @@ -130,7 +130,7 @@ single, uncompressed tbl file per table. Example command for SF=10 ```shell -tpchgen-cli -s 10 +spatialbench-cli -s 10 ``` ## `tbl_tpchgen_1.sh` @@ -145,7 +145,7 @@ Example command for SF=10 ```shell # Scale factor 10 -tpchgen-cli -s 10 --num-threads=1 +spatialbench-cli -s 10 --num-threads=1 ``` ## `tbl_dbgen.sh` @@ -248,9 +248,9 @@ dd if=/dev/zero of=/data/test1.img bs=1G count=10 oflag=dsync ## install `tpchgen-rs` ```shell cd /data -git clone git@github.com:clflushopt/tpchgen-rs.git -cd tpchgen-rs -cargo install --path tpchgen-cli +git clone git@github.com:clflushopt/spatialbench-rs.git +cd spatialbench-rs +cargo install --path spatialbench-cli ``` diff --git a/benchmarks/parquet_tpchgen.sh b/benchmarks/parquet_spatialbench.sh similarity index 50% rename from benchmarks/parquet_tpchgen.sh rename to benchmarks/parquet_spatialbench.sh index f9c40ac..7536ade 100755 --- a/benchmarks/parquet_tpchgen.sh +++ b/benchmarks/parquet_spatialbench.sh @@ -1,11 +1,11 @@ #!/bin/bash # -# Runs the tpchgen-cli to generate parquet data at various scales, +# Runs the spatialbench-cli to generate parquet data at various scales, set -x set -e -LOGFILE=parquet_tpchgen.txt +LOGFILE=parquet_spatialbench.txt echo "***********Timings**********" >> $LOGFILE date >> $LOGFILE uname -a >> $LOGFILE @@ -13,5 +13,5 @@ uname -a >> $LOGFILE SCALE_FACTORS="1 10 100 1000" for sf in $SCALE_FACTORS ; do echo "SF=$sf" >> $LOGFILE - /usr/bin/time -a -o $LOGFILE tpchgen-cli -s $sf --output-dir=out_tpchgen --format=parquet + /usr/bin/time -a -o $LOGFILE spatialbench-cli -s $sf --output-dir=out_spatialbench --format=parquet done diff --git a/benchmarks/tbl_tpchgen.sh b/benchmarks/tbl_spatialbench.sh similarity index 53% rename from benchmarks/tbl_tpchgen.sh rename to benchmarks/tbl_spatialbench.sh index f89c836..518df1b 100755
--- a/benchmarks/tbl_tpchgen.sh +++ b/benchmarks/tbl_spatialbench.sh @@ -1,11 +1,11 @@ #!/bin/bash # -# Runs the tpchgen-cli to generate tbl data at various scales, +# Runs the spatialbench-cli to generate tbl data at various scales, set -x set -e -LOGFILE=tbl_tpchgen.txt +LOGFILE=tbl_spatialbench.txt echo "***********Timings**********" >> $LOGFILE date >> $LOGFILE uname -a >> $LOGFILE @@ -13,5 +13,5 @@ uname -a >> $LOGFILE SCALE_FACTORS="1 10 100 1000" for sf in $SCALE_FACTORS ; do echo "SF=$sf" >> $LOGFILE - /usr/bin/time -a -o $LOGFILE tpchgen-cli -s $sf --output-dir=out_tpchgen + /usr/bin/time -a -o $LOGFILE spatialbench-cli -s $sf --output-dir=out_spatialbench done diff --git a/benchmarks/tbl_tpchgen_1.sh b/benchmarks/tbl_spatialbench_1.sh similarity index 59% rename from benchmarks/tbl_tpchgen_1.sh rename to benchmarks/tbl_spatialbench_1.sh index 00f735b..ae325b2 100755 --- a/benchmarks/tbl_tpchgen_1.sh +++ b/benchmarks/tbl_spatialbench_1.sh @@ -1,12 +1,12 @@ #!/bin/bash # -# Runs the tpchgen-cli, restricted to a single core +# Runs the spatialbench-cli, restricted to a single core # to generate tbl data at various scales set -x set -e -LOGFILE=tbl_tpchgen_1.txt +LOGFILE=tbl_spatialbench_1.txt echo "***********Timings**********" >> $LOGFILE date >> $LOGFILE uname -a >> $LOGFILE @@ -15,5 +15,5 @@ uname -a >> $LOGFILE SCALE_FACTORS="1000" for sf in $SCALE_FACTORS ; do echo "SF=$sf" >> $LOGFILE - /usr/bin/time -a -o $LOGFILE tpchgen-cli --num-threads=1 -s $sf --output-dir=out_tpchgen + /usr/bin/time -a -o $LOGFILE spatialbench-cli --num-threads=1 -s $sf --output-dir=out_spatialbench done diff --git a/tpchgen-arrow/Cargo.toml b/spatialbench-arrow/Cargo.toml similarity index 72% rename from tpchgen-arrow/Cargo.toml rename to spatialbench-arrow/Cargo.toml index 8132204..47e9d14 100644 --- a/tpchgen-arrow/Cargo.toml +++ b/spatialbench-arrow/Cargo.toml @@ -1,16 +1,16 @@ [package] -name = "tpchgen-arrow" +name = "spatialbench-arrow" version = "1.1.0" edition = "2024" 
authors = ["clflushopt", "alamb"] description = "TPC-H data generator into Apache Arrow format" -repository = "https://github.com/clflushopt/tpchgen-rs" +repository = "https://github.com/wherobots/sedona-spatialbench" readme = "README.md" license = "Apache-2.0" [dependencies] arrow = { version = "54.0.0", default-features = false, features = ["prettyprint"] } -tpchgen = { path = "../tpchgen", version = "1.1.0" } +spatialbench = { path = "../spatialbench", version = "1.1.0" } geo = { workspace = true } geozero = { workspace = true } [dev-dependencies] diff --git a/tpchgen-arrow/LICENSE b/spatialbench-arrow/LICENSE similarity index 100% rename from tpchgen-arrow/LICENSE rename to spatialbench-arrow/LICENSE diff --git a/tpchgen-arrow/README.md b/spatialbench-arrow/README.md similarity index 88% rename from tpchgen-arrow/README.md rename to spatialbench-arrow/README.md index bdccec7..4abfc0e 100644 --- a/tpchgen-arrow/README.md +++ b/spatialbench-arrow/README.md @@ -20,4 +20,4 @@ This crate ensures correct results using two methods. Please see [CONTRIBUTING.md] for more information on how to contribute to this project. 
-[CONTRIBUTING.md]: https://github.com/clflushopt/tpchgen-rs/blob/main/CONTRIBUTING.md \ No newline at end of file +[CONTRIBUTING.md]: https://github.com/wherobots/sedona-spatialbench/blob/main/CONTRIBUTING.md \ No newline at end of file diff --git a/tpchgen-arrow/src/building.rs b/spatialbench-arrow/src/building.rs similarity index 91% rename from tpchgen-arrow/src/building.rs rename to spatialbench-arrow/src/building.rs index 1814edd..8d16aba 100644 --- a/tpchgen-arrow/src/building.rs +++ b/spatialbench-arrow/src/building.rs @@ -4,17 +4,17 @@ use arrow::array::{BinaryArray, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use geo::Geometry; use geozero::{CoordDimensions, ToWkb}; +use spatialbench::generators::{BuildingGenerator, BuildingGeneratorIterator}; use std::sync::{Arc, LazyLock}; -use tpchgen::generators::{BuildingGenerator, BuildingGeneratorIterator}; /// Generate [`Building`]s in [`RecordBatch`] format /// -/// [`Building`]: tpchgen::generators::Building +/// [`Building`]: spatialbench::generators::Building /// /// # Example /// ``` -/// # use tpchgen::generators::{BuildingGenerator}; -/// # use tpchgen_arrow::BuildingArrow; +/// # use spatialbench::generators::{BuildingGenerator}; +/// # use spatialbench_arrow::BuildingArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator /// let generator = BuildingGenerator::new(1.0, 1, 1); diff --git a/tpchgen-arrow/src/conversions.rs b/spatialbench-arrow/src/conversions.rs similarity index 94% rename from tpchgen-arrow/src/conversions.rs rename to spatialbench-arrow/src/conversions.rs index b55ef4d..2e51719 100644 --- a/tpchgen-arrow/src/conversions.rs +++ b/spatialbench-arrow/src/conversions.rs @@ -1,9 +1,9 @@ //! 
Routines to convert TPCH types to Arrow types use arrow::array::{StringViewArray, StringViewBuilder}; +use spatialbench::dates::TPCHDate; +use spatialbench::decimal::TPCHDecimal; use std::fmt::Write; -use tpchgen::dates::TPCHDate; -use tpchgen::decimal::TPCHDecimal; /// Convert a TPCHDecimal to an Arrow Decimal(15,2) #[inline(always)] @@ -20,7 +20,7 @@ pub fn to_arrow_decimal(value: TPCHDecimal) -> i128 { /// /// ``` /// use chrono::NaiveDate; -/// use tpchgen::dates::TPCHDate; +/// use spatialbench::dates::TPCHDate; /// let arrow_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); /// let tpch_epoch = NaiveDate::from_ymd_opt(1992, 1, 1).unwrap(); /// // the difference between the two epochs is 8035 days @@ -68,7 +68,7 @@ where #[cfg(test)] mod tests { use super::*; - use tpchgen::dates::MIN_GENERATE_DATE; + use spatialbench::dates::MIN_GENERATE_DATE; #[test] fn test_to_arrow_decimal() { diff --git a/tpchgen-arrow/src/customer.rs b/spatialbench-arrow/src/customer.rs similarity index 95% rename from tpchgen-arrow/src/customer.rs rename to spatialbench-arrow/src/customer.rs index 4e2e6be..126bdbc 100644 --- a/tpchgen-arrow/src/customer.rs +++ b/spatialbench-arrow/src/customer.rs @@ -2,17 +2,17 @@ use crate::conversions::string_view_array_from_display_iter; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use spatialbench::generators::{CustomerGenerator, CustomerGeneratorIterator}; use std::sync::{Arc, LazyLock}; -use tpchgen::generators::{CustomerGenerator, CustomerGeneratorIterator}; /// Generate [`Customer`]s in [`RecordBatch`] format /// -/// [`Customer`]: tpchgen::generators::Customer +/// [`Customer`]: spatialbench::generators::Customer /// /// # Example /// ``` -/// # use tpchgen::generators::{CustomerGenerator}; -/// # use tpchgen_arrow::CustomerArrow; +/// # use spatialbench::generators::{CustomerGenerator}; +/// # use 
spatialbench_arrow::CustomerArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator /// let generator = CustomerGenerator::new(1.0, 1, 1); diff --git a/tpchgen-arrow/src/driver.rs b/spatialbench-arrow/src/driver.rs similarity index 95% rename from tpchgen-arrow/src/driver.rs rename to spatialbench-arrow/src/driver.rs index af760a4..8294397 100644 --- a/tpchgen-arrow/src/driver.rs +++ b/spatialbench-arrow/src/driver.rs @@ -2,17 +2,17 @@ use crate::conversions::string_view_array_from_display_iter; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use spatialbench::generators::{DriverGenerator, DriverGeneratorIterator}; use std::sync::{Arc, LazyLock}; -use tpchgen::generators::{DriverGenerator, DriverGeneratorIterator}; /// Generate [`Driver`]s in [`RecordBatch`] format /// -/// [`Driver`]: tpchgen::generators::Driver +/// [`Driver`]: spatialbench::generators::Driver /// /// # Example: /// ``` -/// # use tpchgen::generators::{DriverGenerator}; -/// # use tpchgen_arrow::DriverArrow; +/// # use spatialbench::generators::{DriverGenerator}; +/// # use spatialbench_arrow::DriverArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator /// let generator = DriverGenerator::new(1.0, 1, 1); diff --git a/spatialbench-arrow/src/lib.rs b/spatialbench-arrow/src/lib.rs new file mode 100644 index 0000000..08d2c3a --- /dev/null +++ b/spatialbench-arrow/src/lib.rs @@ -0,0 +1,61 @@ +//! Generate TPCH data as Arrow RecordBatches +//! +//! This crate provides generators for TPCH tables that directly produces +//! Arrow [`RecordBatch`]es. This is significantly faster than generating TBL or CSV +//! files and then parsing them into Arrow. +//! +//! # Example +//! ``` +//! # use spatialbench::generators::TripGenerator; +//! # use spatialbench_arrow::TripArrow; +//! # use arrow::util::pretty::pretty_format_batches; +//! 
// Create a SF=1 generator for the LineItem table +//! let generator = TripGenerator::new(1.0, 1, 1); +//! let mut arrow_generator = TripArrow::new(generator) +//! .with_batch_size(10); +//! // The generator is a Rust iterator, producing RecordBatch +//! let batch = arrow_generator.next().unwrap(); +//! // compare the output by pretty printing it +//! let formatted_batches = pretty_format_batches(&[batch]).unwrap().to_string(); +//! assert_eq!(formatted_batches.lines().collect::<Vec<_>>(), vec![ +//! "+-----------+-----------+-------------+--------------+--------------+---------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+", +//! "| t_tripkey | t_custkey | t_driverkey | t_vehiclekey | t_pickuptime | t_dropofftime | t_fare | t_tip | t_totalamount | t_distance | t_pickuploc | t_dropoffloc |", +//! "+-----------+-----------+-------------+--------------+--------------+---------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+", +//! "| 1 | 21425 | 47 | 46 | 1997-07-24 | 1997-07-24 | 0.00034 | 0.00002 | 0.00037 | 0.00014 | 010100000000000000009f65c000000000008056c0 | 01010000003c13323f719f65c0c62bcff1f28856c0 |", +//! "| 2 | 17012 | 66 | 65 | 1997-12-24 | 1997-12-24 | 0.00003 | 0.00000 | 0.00004 | 0.00001 | 010100000000000000800165c000000000001835c0 | 0101000000ed03047c0f0165c00ee6b7aa721735c0 |", +//! "| 3 | 4454 | 68 | 67 | 1993-06-27 | 1993-06-27 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 010100000000000000007265c000000000809953c0 | 0101000000336b00b00e7265c02f695d509e9953c0 |", +//! "| 4 | 3875 | 82 | 81 | 1996-08-02 | 1996-08-02 | 0.00005 | 0.00000 | 0.00005 | 0.00002 | 010100000000000000800f56c00000000000c63bc0 | 01010000004a916d7e111056c0621ccaa6fdcb3bc0 |", +//! 
"| 5 | 23027 | 9 | 8 | 1996-08-23 | 1996-08-23 | 0.00002 | 0.00000 | 0.00003 | 0.00001 | 010100000000000000406460c00000000000da4640 | 0101000000acb0a6a1ae6460c0e1a5886c17db4640 |", +//! "| 6 | 4573 | 41 | 40 | 1994-11-16 | 1994-11-16 | 0.00003 | 0.00000 | 0.00003 | 0.00001 | 010100000000000000002666c000000000806f40c0 | 01010000006100a6303e2666c09f84c465c06d40c0 |", +//! "| 7 | 28319 | 60 | 59 | 1996-01-20 | 1996-01-20 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 010100000000000000002963c00000000000e040c0 | 010100000000000000002963c00000000000e040c0 |", +//! "| 8 | 23288 | 32 | 31 | 1995-01-09 | 1995-01-10 | 0.00003 | 0.00000 | 0.00003 | 0.00001 | 010100000000000000008056c000000000c03955c0 | 01010000007fcc20a00d8156c0daf2ab4bd83955c0 |", +//! "| 9 | 17744 | 100 | 99 | 1993-10-13 | 1993-10-13 | 0.00005 | 0.00001 | 0.00007 | 0.00003 | 010100000000000000005366c00000000000e050c0 | 010100000065523404e75266c045ff5c8406e250c0 |", +//! "| 10 | 11800 | 98 | 97 | 1994-11-08 | 1994-11-08 | 0.00001 | 0.00000 | 0.00001 | 0.00000 | 010100000000000000008066c000000000c07456c0 | 01010000001ded0e6fe27f66c001744f41837456c0 |", +//! "+-----------+-----------+-------------+--------------+--------------+---------------+---------+---------+---------------+------------+--------------------------------------------+--------------------------------------------+" +//! ]); +//! 
``` + +mod building; +pub mod conversions; +mod customer; +mod driver; +mod trip; +mod vehicle; +mod zone; + +use arrow::array::RecordBatch; +use arrow::datatypes::SchemaRef; +pub use building::BuildingArrow; +pub use customer::CustomerArrow; +pub use driver::DriverArrow; +pub use trip::TripArrow; +pub use vehicle::VehicleArrow; +pub use zone::ZoneArrow; + +/// Iterator of Arrow [`RecordBatch`] that also knows its schema +pub trait RecordBatchIterator: Iterator<Item = RecordBatch> + Send { + fn schema(&self) -> &SchemaRef; +} + +/// The default number of rows in each Batch +pub const DEFAULT_BATCH_SIZE: usize = 8 * 1000; diff --git a/tpchgen-arrow/src/trip.rs b/spatialbench-arrow/src/trip.rs similarity index 98% rename from tpchgen-arrow/src/trip.rs rename to spatialbench-arrow/src/trip.rs index a4fc7ce..8305dcb 100644 --- a/tpchgen-arrow/src/trip.rs +++ b/spatialbench-arrow/src/trip.rs @@ -4,8 +4,8 @@ use arrow::array::{BinaryArray, Date32Array, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use geo::Geometry; use geozero::{CoordDimensions, ToWkb}; +use spatialbench::generators::{Trip, TripGenerator, TripGeneratorIterator}; use std::sync::{Arc, LazyLock, Mutex}; -use tpchgen::generators::{Trip, TripGenerator, TripGeneratorIterator}; // Thread-safe wrapper for TripGeneratorIterator struct ThreadSafeTripGenerator { diff --git a/tpchgen-arrow/src/vehicle.rs b/spatialbench-arrow/src/vehicle.rs similarity index 95% rename from tpchgen-arrow/src/vehicle.rs rename to spatialbench-arrow/src/vehicle.rs index bc09cda..8e39013 100644 --- a/tpchgen-arrow/src/vehicle.rs +++ b/spatialbench-arrow/src/vehicle.rs @@ -2,17 +2,17 @@ use crate::conversions::string_view_array_from_display_iter; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{Int64Array, RecordBatch, StringViewArray}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use spatialbench::generators::{VehicleGenerator, VehicleGeneratorIterator}; use 
std::sync::{Arc, LazyLock}; -use tpchgen::generators::{VehicleGenerator, VehicleGeneratorIterator}; /// Generate [`Vehicle`]s in [`RecordBatch`] format /// -/// [`Vehicle`]: tpchgen::generators::Vehicle +/// [`Vehicle`]: spatialbench::generators::Vehicle /// /// # Example /// ``` -/// # use tpchgen::generators::{VehicleGenerator}; -/// # use tpchgen_arrow::VehicleArrow; +/// # use spatialbench::generators::{VehicleGenerator}; +/// # use spatialbench_arrow::VehicleArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator /// let generator = VehicleGenerator::new(1.0, 1, 1); diff --git a/tpchgen-arrow/src/zone.rs b/spatialbench-arrow/src/zone.rs similarity index 94% rename from tpchgen-arrow/src/zone.rs rename to spatialbench-arrow/src/zone.rs index 9b49abf..06794b3 100644 --- a/tpchgen-arrow/src/zone.rs +++ b/spatialbench-arrow/src/zone.rs @@ -3,17 +3,17 @@ use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; use arrow::array::{BinaryArray, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use geozero::{CoordDimensions, ToWkb}; +use spatialbench::generators::{ZoneGenerator, ZoneGeneratorIterator}; use std::sync::{Arc, LazyLock}; -use tpchgen::generators::{ZoneGenerator, ZoneGeneratorIterator}; /// Generate [`Zone`]s in [`RecordBatch`] format /// -/// [`Zone`]: tpchgen::generators::Zone +/// [`Zone`]: spatialbench::generators::Zone /// /// # Example /// ``` -/// # use tpchgen::generators::{ZoneGenerator}; -/// # use tpchgen_arrow::ZoneArrow; +/// # use spatialbench::generators::{ZoneGenerator}; +/// # use spatialbench_arrow::ZoneArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator /// let generator = ZoneGenerator::new(0.001, 1, 1); diff --git a/tpchgen-arrow/tests/reparse.rs b/spatialbench-arrow/tests/reparse.rs similarity index 96% rename from tpchgen-arrow/tests/reparse.rs rename to spatialbench-arrow/tests/reparse.rs index ad1ef54..c589195 100644 --- a/tpchgen-arrow/tests/reparse.rs 
+++ b/spatialbench-arrow/tests/reparse.rs @@ -3,17 +3,17 @@ use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; -use std::io::Write; -use std::sync::Arc; -use tpchgen::csv::{BuildingCsv, CustomerCsv, DriverCsv, TripCsv, VehicleCsv, ZoneCsv}; -use tpchgen::generators::{ +use spatialbench::csv::{BuildingCsv, CustomerCsv, DriverCsv, TripCsv, VehicleCsv, ZoneCsv}; +use spatialbench::generators::{ Building, BuildingGenerator, Customer, CustomerGenerator, Driver, DriverGenerator, Trip, TripGenerator, Vehicle, VehicleGenerator, Zone, ZoneGenerator, }; -use tpchgen_arrow::{ +use spatialbench_arrow::{ BuildingArrow, CustomerArrow, DriverArrow, RecordBatchIterator, TripArrow, VehicleArrow, ZoneArrow, }; +use std::io::Write; +use std::sync::Arc; use arrow::array::Array; use arrow::array::BinaryArray; @@ -142,10 +142,6 @@ impl Test { // Skip reparsing for Binary-containing schemas if contains_binary { - eprintln!( - "Skipping reparsing test for schema with Binary type: {:?}", - schema - ); continue; } diff --git a/tpchgen-cli/Cargo.toml b/spatialbench-cli/Cargo.toml similarity index 75% rename from tpchgen-cli/Cargo.toml rename to spatialbench-cli/Cargo.toml index f482cbc..bfcb6a5 100644 --- a/tpchgen-cli/Cargo.toml +++ b/spatialbench-cli/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "tpchgen-cli" +name = "spatialbench-cli" version = "1.1.0" authors = { workspace = true } description = "Blazing fast pure Rust TPC-H data generator command line tool." 
@@ -13,8 +13,8 @@ repository = { workspace = true } arrow = "54.3.0" parquet = "54.3.0" clap = { version = "4.5.32", features = ["derive"] } -tpchgen = { path = "../tpchgen", version = "1.1.0"} -tpchgen-arrow = { path = "../tpchgen-arrow", version = "1.1.0" } +spatialbench = { path = "../spatialbench", version = "1.1.0"} +spatialbench-arrow = { path = "../spatialbench-arrow", version = "1.1.0" } tokio = { version = "1.44.1", features = ["full"]} futures = "0.3.31" num_cpus = "1.0" diff --git a/tpchgen-cli/README.md b/spatialbench-cli/README.md similarity index 80% rename from tpchgen-cli/README.md rename to spatialbench-cli/README.md index af0a161..7dd1c65 100644 --- a/tpchgen-cli/README.md +++ b/spatialbench-cli/README.md @@ -1,6 +1,6 @@ -# TPC-H Data Generator CLI +# SpatialBench Data Generator CLI -See the main [README.md](https://github.com/clflushopt/tpchgen-rs) for full documentation. +See the main [README.md](https://github.com/wherobots/sedona-spatialbench) for full documentation. ## Installation @@ -8,7 +8,7 @@ See the main [README.md](https://github.com/clflushopt/tpchgen-rs) for full docu Install this tool with Python: ```shell -pip install tpchgen-cli +pip install spatialbench-cli ``` ### Install Using Rust @@ -17,19 +17,19 @@ pip install tpchgen-cli ```shell curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -cargo install tpchgen-cli +cargo install spatialbench-cli ``` ## CLI Usage -We tried to make the `tpchgen-cli` experience as close to `dbgen` as possible for no other +We tried to make the `spatialbench-cli` experience as close to `dbgen` as possible for no other reason than maybe make it easier for you to have a drop-in replacement. 
```shell -$ tpchgen-cli -h +$ spatialbench-cli -h TPC-H Data Generator -Usage: tpchgen-cli [OPTIONS] +Usage: spatialbench-cli [OPTIONS] Options: -s, --scale-factor <SCALE_FACTOR> @@ -58,5 +58,5 @@ Options: For example generating a dataset with a scale factor of 1 (1GB) can be done like this: ```shell -$ tpchgen-cli -s 1 --output-dir=/tmp/tpch +$ spatialbench-cli -s 1 --output-dir=/tmp/tpch ``` diff --git a/tpchgen-cli/pyproject.toml b/spatialbench-cli/pyproject.toml similarity index 91% rename from tpchgen-cli/pyproject.toml rename to spatialbench-cli/pyproject.toml index ba598db..a58ef5a 100644 --- a/tpchgen-cli/pyproject.toml +++ b/spatialbench-cli/pyproject.toml @@ -3,7 +3,7 @@ requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" [project] -name = "tpchgen-cli" +name = "spatialbench-cli" dynamic = ["version"] description = "Python CLI for TPC-H data generator" requires-python = ">=3.8" diff --git a/tpchgen-cli/src/csv.rs b/spatialbench-cli/src/csv.rs similarity index 93% rename from tpchgen-cli/src/csv.rs rename to spatialbench-cli/src/csv.rs index 93048d0..7f9ca3b 100644 --- a/tpchgen-cli/src/csv.rs +++ b/spatialbench-cli/src/csv.rs @@ -1,11 +1,11 @@ //! Implementations of [`Source`] for generating data in TBL format use super::generate::Source; -use std::io::Write; -use tpchgen::csv::{BuildingCsv, CustomerCsv, DriverCsv, TripCsv, VehicleCsv, ZoneCsv}; -use tpchgen::generators::{ +use spatialbench::csv::{BuildingCsv, CustomerCsv, DriverCsv, TripCsv, VehicleCsv, ZoneCsv}; +use spatialbench::generators::{ BuildingGenerator, CustomerGenerator, DriverGenerator, TripGenerator, VehicleGenerator, ZoneGenerator, }; +use std::io::Write; /// Define a Source that writes the table in CSV format macro_rules! 
define_csv_source { diff --git a/tpchgen-cli/src/generate.rs b/spatialbench-cli/src/generate.rs similarity index 99% rename from tpchgen-cli/src/generate.rs rename to spatialbench-cli/src/generate.rs index 81ff97c..bbe3cef 100644 --- a/tpchgen-cli/src/generate.rs +++ b/spatialbench-cli/src/generate.rs @@ -12,7 +12,7 @@ use tokio::task::JoinSet; /// Something that knows how to generate data into a buffer /// -/// For example, this is implemented for the different generators in the tpchgen +/// For example, this is implemented for the different generators in the spatialbench /// crate pub trait Source: Send { /// generates the data for this generator into the buffer, returning the buffer. diff --git a/tpchgen-cli/src/main.rs b/spatialbench-cli/src/main.rs similarity index 98% rename from tpchgen-cli/src/main.rs rename to spatialbench-cli/src/main.rs index 0539fbc..83670e0 100644 --- a/tpchgen-cli/src/main.rs +++ b/spatialbench-cli/src/main.rs @@ -6,7 +6,7 @@ //! //! ``` //! USAGE: -//! tpchgen-cli [OPTIONS] +//! spatialbench-cli [OPTIONS] //! //! OPTIONS: //! -h, --help Prints help information @@ -31,13 +31,13 @@ //! # Examples //! ``` //! # see all info output -//! tpchgen-cli -s 1 -v +//! spatialbench-cli -s 1 -v //! //! # same thing using RUST_LOG -//! RUST_LOG=info tpchgen-cli -s 1 +//! RUST_LOG=info spatialbench-cli -s 1 //! //! # see all debug output -//! RUST_LOG=debug tpchgen -s 1 +//! RUST_LOG=debug spatialbench -s 1 //! 
``` mod csv; mod generate; @@ -54,25 +54,25 @@ use ::parquet::basic::Compression; use clap::builder::TypedValueParser; use clap::{Parser, ValueEnum}; use log::{debug, info, LevelFilter}; -use std::fmt::Display; -use std::fs::{self, File}; -use std::io::{self, BufWriter, Stdout, Write}; -use std::path::PathBuf; -use std::str::FromStr; -use std::time::Instant; -use tpchgen::distribution::Distributions; -use tpchgen::generators::{ +use spatialbench::distribution::Distributions; +use spatialbench::generators::{ BuildingGenerator, CustomerGenerator, DriverGenerator, TripGenerator, VehicleGenerator, ZoneGenerator, }; -use tpchgen::text::TextPool; -use tpchgen_arrow::{ +use spatialbench::text::TextPool; +use spatialbench_arrow::{ BuildingArrow, CustomerArrow, DriverArrow, RecordBatchIterator, TripArrow, VehicleArrow, ZoneArrow, }; +use std::fmt::Display; +use std::fs::{self, File}; +use std::io::{self, BufWriter, Stdout, Write}; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::Instant; #[derive(Parser)] -#[command(name = "tpchgen")] +#[command(name = "spatialbench")] #[command(version)] #[command(about = "TPC-H Data Generator", long_about = None)] struct Cli { diff --git a/tpchgen-cli/src/parquet.rs b/spatialbench-cli/src/parquet.rs similarity index 99% rename from tpchgen-cli/src/parquet.rs rename to spatialbench-cli/src/parquet.rs index 767aff4..53b60f4 100644 --- a/tpchgen-cli/src/parquet.rs +++ b/spatialbench-cli/src/parquet.rs @@ -10,11 +10,11 @@ use parquet::basic::Compression; use parquet::file::properties::WriterProperties; use parquet::file::writer::SerializedFileWriter; use parquet::schema::types::SchemaDescPtr; +use spatialbench_arrow::RecordBatchIterator; use std::io; use std::io::Write; use std::sync::Arc; use tokio::sync::mpsc::{Receiver, Sender}; -use tpchgen_arrow::RecordBatchIterator; pub trait IntoSize { /// Convert the object into a size diff --git a/tpchgen-cli/src/statistics.rs b/spatialbench-cli/src/statistics.rs similarity index 100% 
rename from tpchgen-cli/src/statistics.rs rename to spatialbench-cli/src/statistics.rs diff --git a/tpchgen-cli/src/tbl.rs b/spatialbench-cli/src/tbl.rs similarity index 97% rename from tpchgen-cli/src/tbl.rs rename to spatialbench-cli/src/tbl.rs index 29de2a3..b7019c7 100644 --- a/tpchgen-cli/src/tbl.rs +++ b/spatialbench-cli/src/tbl.rs @@ -1,11 +1,11 @@ //! Implementations of [`Source`] for generating data in TBL format use super::generate::Source; -use std::io::Write; -use tpchgen::generators::{ +use spatialbench::generators::{ BuildingGenerator, CustomerGenerator, DriverGenerator, TripGenerator, VehicleGenerator, ZoneGenerator, }; +use std::io::Write; /// Define a Source that writes the table in TBL format macro_rules! define_tbl_source { diff --git a/tpchgen/Cargo.toml b/spatialbench/Cargo.toml similarity index 87% rename from tpchgen/Cargo.toml rename to spatialbench/Cargo.toml index d20ea7a..e679fec 100644 --- a/tpchgen/Cargo.toml +++ b/spatialbench/Cargo.toml @@ -1,8 +1,8 @@ [package] -name = "tpchgen" +name = "spatialbench" authors = ["clflushopt", "alamb"] description = "Blazing fast pure Rust no dependency TPC-H data generation library." 
-repository = "https://github.com/clflushopt/tpchgen-rs" +repository = "https://github.com/wherobots/sedona-spatialbench" readme = { workspace = true } version = { workspace = true } edition = { workspace = true } diff --git a/tpchgen/data/README.md b/spatialbench/data/README.md similarity index 100% rename from tpchgen/data/README.md rename to spatialbench/data/README.md diff --git a/tpchgen/src/csv.rs b/spatialbench/src/csv.rs similarity index 93% rename from tpchgen/src/csv.rs rename to spatialbench/src/csv.rs index 946fdeb..b9b5a9d 100644 --- a/tpchgen/src/csv.rs +++ b/spatialbench/src/csv.rs @@ -8,8 +8,8 @@ use std::fmt::Display; /// /// # Example /// ``` -/// # use tpchgen::generators::VehicleGenerator; -/// # use tpchgen::csv::VehicleCsv; +/// # use spatialbench::generators::VehicleGenerator; +/// # use spatialbench::csv::VehicleCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = VehicleGenerator::new(1.0, 1, 1); @@ -61,8 +61,8 @@ impl Display for VehicleCsv<'_> { /// /// # Example /// ``` -/// # use tpchgen::generators::DriverGenerator; -/// # use tpchgen::csv::DriverCsv; +/// # use spatialbench::generators::DriverGenerator; +/// # use spatialbench::csv::DriverCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = DriverGenerator::new(1.0, 1, 1); @@ -115,8 +115,8 @@ impl Display for DriverCsv { /// /// # Example /// ``` -/// # use tpchgen::generators::CustomerGenerator; -/// # use tpchgen::csv::CustomerCsv; +/// # use spatialbench::generators::CustomerGenerator; +/// # use spatialbench::csv::CustomerCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = CustomerGenerator::new(1.0, 1, 1); @@ -169,8 +169,8 @@ impl Display for CustomerCsv<'_> { /// /// # Example /// ``` -/// # use tpchgen::generators::TripGenerator; -/// # use tpchgen::csv::TripCsv; +/// # use spatialbench::generators::TripGenerator; +/// # use 
spatialbench::csv::TripCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = TripGenerator::new(1.0, 1, 1); @@ -222,8 +222,8 @@ impl Display for TripCsv { /// /// # Example /// ``` -/// # use tpchgen::generators::BuildingGenerator; -/// # use tpchgen::csv::BuildingCsv; +/// # use spatialbench::generators::BuildingGenerator; +/// # use spatialbench::csv::BuildingCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = BuildingGenerator::new(1.0, 1, 1); @@ -264,8 +264,8 @@ impl Display for BuildingCsv<'_> { /// /// # Example /// ``` -/// # use tpchgen::generators::ZoneGenerator; -/// # use tpchgen::csv::ZoneCsv; +/// # use spatialbench::generators::ZoneGenerator; +/// # use spatialbench::csv::ZoneCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format /// let generator = ZoneGenerator::new(0.001, 1, 1); diff --git a/tpchgen/src/dates.rs b/spatialbench/src/dates.rs similarity index 99% rename from tpchgen/src/dates.rs rename to spatialbench/src/dates.rs index c07b1c5..2bc59e5 100644 --- a/tpchgen/src/dates.rs +++ b/spatialbench/src/dates.rs @@ -142,7 +142,7 @@ impl RandomTimeOfDay { /// /// # Example /// ``` -/// # use tpchgen::dates::{TPCHDate, MIN_GENERATE_DATE}; +/// # use spatialbench::dates::{TPCHDate, MIN_GENERATE_DATE}; /// let date = TPCHDate::new(MIN_GENERATE_DATE + 41, 0, 0, 0); /// // Convert the date to y/m/d fields /// assert_eq!((92,2,11), date.to_ymd()); diff --git a/tpchgen/src/decimal.rs b/spatialbench/src/decimal.rs similarity index 98% rename from tpchgen/src/decimal.rs rename to spatialbench/src/decimal.rs index dff1172..bcf2a27 100644 --- a/tpchgen/src/decimal.rs +++ b/spatialbench/src/decimal.rs @@ -16,7 +16,7 @@ impl TPCHDecimal { /// /// # Example /// ``` - /// use tpchgen::decimal::TPCHDecimal; + /// use spatialbench::decimal::TPCHDecimal; /// let decimal = TPCHDecimal::new(1234); /// assert_eq!(decimal.to_string(), "12.34"); /// ``` diff 
--git a/tpchgen/src/distribution.rs b/spatialbench/src/distribution.rs similarity index 100% rename from tpchgen/src/distribution.rs rename to spatialbench/src/distribution.rs diff --git a/tpchgen/src/dists.dss b/spatialbench/src/dists.dss similarity index 100% rename from tpchgen/src/dists.dss rename to spatialbench/src/dists.dss diff --git a/tpchgen/src/generators.rs b/spatialbench/src/generators.rs similarity index 100% rename from tpchgen/src/generators.rs rename to spatialbench/src/generators.rs diff --git a/tpchgen/src/kde.rs b/spatialbench/src/kde.rs similarity index 100% rename from tpchgen/src/kde.rs rename to spatialbench/src/kde.rs diff --git a/tpchgen/src/lib.rs b/spatialbench/src/lib.rs similarity index 98% rename from tpchgen/src/lib.rs rename to spatialbench/src/lib.rs index 262de8e..a4f8056 100644 --- a/tpchgen/src/lib.rs +++ b/spatialbench/src/lib.rs @@ -6,7 +6,7 @@ //! //! # Example: TBL output format //! ``` -//! # use tpchgen::generators::TripGenerator; +//! # use spatialbench::generators::TripGenerator; //! // Create Generator for the TRIP table at Scale Factor 1 (SF 1) //! let scale_factor = 1.0; //! 
let part = 1; diff --git a/tpchgen/src/queries.rs b/spatialbench/src/queries.rs similarity index 100% rename from tpchgen/src/queries.rs rename to spatialbench/src/queries.rs diff --git a/tpchgen/src/random.rs b/spatialbench/src/random.rs similarity index 100% rename from tpchgen/src/random.rs rename to spatialbench/src/random.rs diff --git a/tpchgen/src/spider.rs b/spatialbench/src/spider.rs similarity index 100% rename from tpchgen/src/spider.rs rename to spatialbench/src/spider.rs diff --git a/tpchgen/src/spider_presets.rs b/spatialbench/src/spider_presets.rs similarity index 100% rename from tpchgen/src/spider_presets.rs rename to spatialbench/src/spider_presets.rs diff --git a/tpchgen/src/text.rs b/spatialbench/src/text.rs similarity index 100% rename from tpchgen/src/text.rs rename to spatialbench/src/text.rs diff --git a/tpchgen/tests/integration_tests.rs b/spatialbench/tests/integration_tests.rs similarity index 100% rename from tpchgen/tests/integration_tests.rs rename to spatialbench/tests/integration_tests.rs diff --git a/tpchgen-arrow/src/lib.rs b/tpchgen-arrow/src/lib.rs deleted file mode 100644 index aca6367..0000000 --- a/tpchgen-arrow/src/lib.rs +++ /dev/null @@ -1,60 +0,0 @@ -//! Generate TPCH data as Arrow RecordBatches -//! -//! This crate provides generators for TPCH tables that directly produces -//! Arrow [`RecordBatch`]es. This is significantly faster than generating TBL or CSV -//! files and then parsing them into Arrow. -//! -//! 
# Example - -// # use tpchgen::generators::LineItemGenerator; -// # use tpchgen_arrow::LineItemArrow; -// # use arrow::util::pretty::pretty_format_batches; -// // Create a SF=1 generator for the LineItem table -// let generator = LineItemGenerator::new(1.0, 1, 1); -// let mut arrow_generator = LineItemArrow::new(generator) -// .with_batch_size(10); -// // The generator is a Rust iterator, producing RecordBatch -// let batch = arrow_generator.next().unwrap(); -// // compare the output by pretty printing it -// let formatted_batches = pretty_format_batches(&[batch]).unwrap().to_string(); -// assert_eq!(formatted_batches.lines().collect::<Vec<_>>(), vec![ -// "+------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+-------------------+------------+-------------------------------------+", -// "| l_orderkey | l_vehiclekey | l_suppkey | l_linenumber | l_quantity | l_extendedprice | l_discount | l_tax | l_returnflag | l_linestatus | l_shipdate | l_commitdate | l_receiptdate | l_shipinstruct | l_shipmode | l_comment |", -// "+------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+-------------------+------------+-------------------------------------+", -// "| 1 | 155190 | 7706 | 1 | 17.00 | 21168.23 | 0.04 | 0.02 | N | O | 1996-03-13 | 1996-02-12 | 1996-03-22 | DELIVER IN PERSON | TRUCK | egular courts above the |", -// "| 1 | 67310 | 7311 | 2 | 36.00 | 45983.16 | 0.09 | 0.06 | N | O | 1996-04-12 | 1996-02-28 | 1996-04-20 | TAKE BACK RETURN | MAIL | ly final dependencies: slyly bold |", -// "| 1 | 63700 | 3701 | 3 | 8.00 | 13309.60 | 0.10 | 0.02 | N | O | 1996-01-29 | 1996-03-05 | 1996-01-31 | TAKE BACK RETURN | REG AIR | riously. 
regular, express dep |", -// "| 1 | 2132 | 4633 | 4 | 28.00 | 28955.64 | 0.09 | 0.06 | N | O | 1996-04-21 | 1996-03-30 | 1996-05-16 | NONE | AIR | lites. fluffily even de |", -// "| 1 | 24027 | 1534 | 5 | 24.00 | 22824.48 | 0.10 | 0.04 | N | O | 1996-03-30 | 1996-03-14 | 1996-04-01 | NONE | FOB | pending foxes. slyly re |", -// "| 1 | 15635 | 638 | 6 | 32.00 | 49620.16 | 0.07 | 0.02 | N | O | 1996-01-30 | 1996-02-07 | 1996-02-03 | DELIVER IN PERSON | MAIL | arefully slyly ex |", -// "| 2 | 106170 | 1191 | 1 | 38.00 | 44694.46 | 0.00 | 0.05 | N | O | 1997-01-28 | 1997-01-14 | 1997-02-02 | TAKE BACK RETURN | RAIL | ven requests. deposits breach a |", -// "| 3 | 4297 | 1798 | 1 | 45.00 | 54058.05 | 0.06 | 0.00 | R | F | 1994-02-02 | 1994-01-04 | 1994-02-23 | NONE | AIR | ongside of the furiously brave acco |", -// "| 3 | 19036 | 6540 | 2 | 49.00 | 46796.47 | 0.10 | 0.00 | R | F | 1993-11-09 | 1993-12-20 | 1993-11-24 | TAKE BACK RETURN | RAIL | unusual accounts. eve |", -// "| 3 | 128449 | 3474 | 3 | 27.00 | 39890.88 | 0.06 | 0.07 | A | F | 1994-01-16 | 1993-11-22 | 1994-01-23 | DELIVER IN PERSON | SHIP | nal foxes wake. 
|", -// "+------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+-------------------+------------+-------------------------------------+" -// ]); - -mod building; -pub mod conversions; -mod customer; -mod driver; -mod trip; -mod vehicle; -mod zone; - -use arrow::array::RecordBatch; -use arrow::datatypes::SchemaRef; -pub use building::BuildingArrow; -pub use customer::CustomerArrow; -pub use driver::DriverArrow; -pub use trip::TripArrow; -pub use vehicle::VehicleArrow; -pub use zone::ZoneArrow; - -/// Iterator of Arrow [`RecordBatch`] that also knows its schema -pub trait RecordBatchIterator: Iterator<Item = RecordBatch> + Send { - fn schema(&self) -> &SchemaRef; -} - -/// The default number of rows in each Batch -pub const DEFAULT_BATCH_SIZE: usize = 8 * 1000; diff --git a/tpchgen-rs-readme.md b/tpchgen-rs-readme.md index 9bc0807..c2a0b9a 100644 --- a/tpchgen-rs-readme.md +++ b/tpchgen-rs-readme.md @@ -23,12 +23,12 @@ Blazing fast [TPCH] benchmark data generator, in pure Rust with zero dependencie ### Install Using Python Install this tool with Python: ```shell -pip install tpchgen-cli +pip install spatialbench-cli ``` ```shell # create Scale Factor 10 (3.6GB, 8 files, 60M rows in lineitem) in 5 seconds on a modern laptop -tpchgen-cli -s 10 --format=parquet +spatialbench-cli -s 10 --format=parquet ``` ### Install Using Rust @@ -36,12 +36,12 @@ tpchgen-cli -s 10 --format=parquet ```shell curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -cargo install tpchgen-cli +cargo install spatialbench-cli ``` ```shell # create Scale Factor 10 (3.6GB, 8 files, 60M rows in lineitem) in 5 seconds on a modern laptop -tpchgen-cli -s 10 --format=parquet +spatialbench-cli -s 10 --format=parquet ``` Or watch this [awesome demo](https://www.youtube.com/watch?v=UYIC57hlL14) recorded by [@alamb](https://github.com/alamb) @@ -52,14 +52,14 @@ and the 
companion blog post in the [Datafusion blog](https://datafusion.apache.o ```shell # Create a scale factor 10 dataset in the native table format. -tpchgen-cli -s 10 --output-dir sf10 +spatialbench-cli -s 10 --output-dir sf10 # Create a scale factor 1 dataset in Parquet format. -tpchgen-cli -s 1 --output-dir sf1-parquet --format=parquet +spatialbench-cli -s 1 --output-dir sf1-parquet --format=parquet # Create a scale factor 1 (default) partitioned dataset for the region, nation, orders # and customer tables. -tpchgen-cli --tables region,nation,orders,customer --output-dir sf1-partitioned --parts 10 --part 2 +spatialbench-cli --tables region,nation,orders,customer --output-dir sf1-partitioned --parts 10 --part 2 # Create a scale factor 1 partitioned into separate folders. # @@ -74,7 +74,7 @@ tpchgen-cli --tables region,nation,orders,customer --output-dir sf1-partitioned # c235841b00d29ad4f817771fcc851207 part-2/region.tbl for PART in `seq 1 2`; do mkdir part-$PART - tpchgen-cli --tables region,nation,orders,customer --output-dir part-$PART --parts 10 --part $PART + spatialbench-cli --tables region,nation,orders,customer --output-dir part-$PART --parts 10 --part $PART done ``` @@ -98,7 +98,7 @@ Times to create TPCH tables in Parquet format using `tpchgen-cli` and `duckdb` f  -[`tpchgen-cli`](./tpchgen-cli/README.md) is more than 10x faster than the next +[`tpchgen-cli`](spatialbench-cli/README.md) is more than 10x faster than the next fastest TPCH generator we know of. On a 2023 Mac M3 Max laptop, it easily generates data faster than can be written to SSD. See [BENCHMARKS.md](./benchmarks/BENCHMARKS.md) for more details on performance and @@ -113,13 +113,13 @@ the output of this crate with [`dbgen`] as part of every checkin. See ## Crates -- [`tpchgen`](tpchgen): the core data generator logic for TPC-H. It has no +- [`tpchgen`](spatialbench): the core data generator logic for TPC-H. It has no dependencies and is easy to embed in other Rust project. 
-- [`tpchgen-arrow`](tpchgen-arrow) generates TPC-H data in [Apache Arrow] +- [`tpchgen-arrow`](spatialbench-arrow) generates TPC-H data in [Apache Arrow] format. It depends on the arrow-rs library -- [`tpchgen-cli`](tpchgen-cli) is a [`dbgen`] compatible CLI tool that generates +- [`tpchgen-cli`](spatialbench-cli) is a [`dbgen`] compatible CLI tool that generates benchmark dataset using multiple processes. [Apache Arrow]: https://arrow.apache.org/
