This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit a22cbeccd5b997694f9e0dd83fcb383715bd2928 Author: Pranav Toggi <[email protected]> AuthorDate: Wed Jul 16 19:18:16 2025 -0700 [EWT-2884] Encode spatial columns as WKB (#2) * encode geometries as WKB * update affine transform * update readme * extract affine as named constant * address comments; Remove conformance test suite * add seconds to time columns; add scale_factor for Zone generator; Fix doc tests. * fix lint * fix typo --- .github/workflows/conformance.yml | 47 ------- Cargo.toml | 4 + README.md | 2 +- tests/conformance.sh | 98 --------------- tpchgen-arrow/Cargo.toml | 3 +- tpchgen-arrow/src/building.rs | 17 ++- tpchgen-arrow/src/conversions.rs | 6 +- tpchgen-arrow/src/customer.rs | 28 ++--- tpchgen-arrow/src/driver.rs | 15 ++- tpchgen-arrow/src/lib.rs | 4 +- tpchgen-arrow/src/trip.rs | 22 ++-- tpchgen-arrow/src/vehicle.rs | 28 ++--- tpchgen-arrow/src/zone.rs | 21 +++- tpchgen-arrow/tests/reparse.rs | 101 +++++++++++---- tpchgen-cli/src/main.rs | 11 +- tpchgen/Cargo.toml | 2 + tpchgen/data/sf-0.001/customer.tbl.gz | Bin 9836 -> 0 bytes tpchgen/data/sf-0.001/lineitem.tbl.gz | Bin 194292 -> 0 bytes tpchgen/data/sf-0.001/nation.tbl.gz | Bin 987 -> 0 bytes tpchgen/data/sf-0.001/orders.tbl.gz | Bin 44920 -> 0 bytes tpchgen/data/sf-0.001/part.tbl.gz | Bin 6803 -> 0 bytes tpchgen/data/sf-0.001/partsupp.tbl.gz | Bin 28120 -> 0 bytes tpchgen/data/sf-0.001/region.tbl.gz | Bin 267 -> 0 bytes tpchgen/data/sf-0.001/supplier.tbl.gz | Bin 755 -> 0 bytes tpchgen/data/sf-0.01/customer.tbl.gz | Bin 90794 -> 0 bytes tpchgen/data/sf-0.01/lineitem.tbl.gz | Bin 2050610 -> 0 bytes tpchgen/data/sf-0.01/nation.tbl.gz | Bin 987 -> 0 bytes tpchgen/data/sf-0.01/orders.tbl.gz | Bin 454663 -> 0 bytes tpchgen/data/sf-0.01/part.tbl.gz | Bin 60233 -> 0 bytes tpchgen/data/sf-0.01/partsupp.tbl.gz | Bin 276455 -> 0 bytes tpchgen/data/sf-0.01/region.tbl.gz | Bin 267 -> 0 bytes tpchgen/data/sf-0.01/supplier.tbl.gz | Bin 6295 -> 0 bytes tpchgen/src/csv.rs | 38 +++--- tpchgen/src/dates.rs | 115 ++++++++++------- tpchgen/src/generators.rs | 225 ++++++++++++++++++---------------- tpchgen/src/lib.rs | 26 ++-- tpchgen/src/spider.rs | 105 ++++++++++++---- tpchgen/src/spider_presets.rs | 60 ++------- tpchgen/tests/integration_tests.rs | 3 - 39 files changed, 503 insertions(+), 478 deletions(-) diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml deleted file mode 100644 index 0ab6356..0000000 --- a/.github/workflows/conformance.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Conformance tests - -on: - push: - branches: ["main"] - paths-ignore: - - '**.md' - pull_request: - paths-ignore: - - '**.md' - workflow_dispatch: # Allow manual triggering - -env: - CARGO_TERM_COLOR: always - -jobs: - # Conformance tests (compare to existing implementation) - conformance-test: - name: TPC-H Conformance Tests - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Set up Rust - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - - - name: Cache Rust dependencies - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-cargo- - - - name: Make conformance script executable - run: chmod +x ./tests/conformance.sh - - - name: Run Conformance Tests - run: ./tests/conformance.sh diff --git a/Cargo.toml b/Cargo.toml index 6de9042..c5433a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,5 +13,9 @@ readme = "README.md" repository = "https://github.com/clflushopt/tpchgen-rs" version = "1.1.0" +[workspace.dependencies] +geo = "0.30.0" +geozero = { version = "0.14.0", features = ["with-wkb", "with-geo"]} + [profile.release] strip = "debuginfo" diff --git a/README.md b/README.md index 32485da..92a4a60 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ for PART in $(seq 1 4); do done ``` -## SedonaBench Spider Data Generator +## SpatialBench Spider Data Generator SpatialBench includes a synthetic spatial data generator ([spider.rs](https://github.com/wherobots/sedona-spatialbench/blob/main/tpchgen/src/spider.rs)) for creating: - Points diff --git a/tests/conformance.sh b/tests/conformance.sh deleted file mode 100644 index ba0ddf0..0000000 --- a/tests/conformance.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -# This script runs conformance tests against `dbgen` official implementation -# for scale factors 0.001, 0.01 and 0.1 and 1. - -# Bash niceness. -# -# Exit immediately if a command exits with a non-zero status -set -e -# Treat unset variables as an error when substituting -set -u -# Consider the exit status of all commands in a pipeline -set -o pipefail -# Print commands before executing them -set -x - -# Scale factors to run against. -SCALE_FACTORS=("0.001" "0.01" "0.1" "1") - -# Define tables to compare, we compare all tables but be explicit about them. -TABLES=("nation" "region" "part" "supplier" "partsupp" "customer" "orders" "lineitem") - -# Build the Rust generator -echo "Building tpchgen-rs..." -cargo build --release - -# Run tests for each scale factor -for SF in "${SCALE_FACTORS[@]}"; do - echo "Testing scale factor ${SF}..." - - # Create output directories - RUST_DIR="/tmp/tpchgen-rs-${SF}" - C_DIR="/tmp/tpchgen-c-${SF}" - - rm -rf "${RUST_DIR}" "${C_DIR}" - mkdir -p "${RUST_DIR}" "${C_DIR}" - - # Generate data using Rust implementation - echo "Generating data with Rust implementation at SF=${SF}..." - cargo run --release --bin tpchgen-cli -- --scale-factor "${SF}" --output-dir "${RUST_DIR}" - - # Generate data using C implementation - echo "Generating data with C implementation at SF=${SF}..." - docker run -v "${C_DIR}:/data" --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SF}" - - # Compare files - DIFF_COUNT=0 - DIFF_ERRORS="" - - for TABLE in "${TABLES[@]}"; do - RUST_FILE="${RUST_DIR}/${TABLE}.tbl" - C_FILE="${C_DIR}/${TABLE}.tbl" - - # Ensure both files exist - if [[ ! -f "${RUST_FILE}" ]]; then - echo "ERROR: Rust implementation did not generate ${TABLE}.tbl" - DIFF_COUNT=$((DIFF_COUNT + 1)) - DIFF_ERRORS="${DIFF_ERRORS}\nMissing file: ${RUST_FILE}" - continue - fi - - if [[ ! -f "${C_FILE}" ]]; then - echo "ERROR: C implementation did not generate ${TABLE}.tbl" - DIFF_COUNT=$((DIFF_COUNT + 1)) - DIFF_ERRORS="${DIFF_ERRORS}\nMissing file: ${C_FILE}" - continue - fi - - # Compare files - echo "Comparing ${TABLE}.tbl..." - - if ! diff -q "${RUST_FILE}" "${C_FILE}" > /dev/null; then - echo "ERROR: ${TABLE}.tbl files differ!" - # Get a few sample differences - DIFF_SAMPLE=$(diff "${RUST_FILE}" "${C_FILE}" | head -n 10) - DIFF_COUNT=$((DIFF_COUNT + 1)) - DIFF_ERRORS="${DIFF_ERRORS}\nDifferences in ${TABLE}.tbl:\n${DIFF_SAMPLE}\n" - else - echo "SUCCESS: ${TABLE}.tbl files match!" - fi - done - - # Report results - echo "--------------------------------------------" - echo "Scale Factor ${SF} Results:" - if [[ ${DIFF_COUNT} -eq 0 ]]; then - echo "All tables match! ✅" - else - echo "${DIFF_COUNT} tables have differences! ❌" - echo -e "${DIFF_ERRORS}" - echo "Test failed for scale factor ${SF}" - exit 1 - fi - echo "--------------------------------------------" -done - -echo "All conformance tests passed successfully! ✅" -exit 0 \ No newline at end of file diff --git a/tpchgen-arrow/Cargo.toml b/tpchgen-arrow/Cargo.toml index 5242c61..8132204 100644 --- a/tpchgen-arrow/Cargo.toml +++ b/tpchgen-arrow/Cargo.toml @@ -11,7 +11,8 @@ license = "Apache-2.0" [dependencies] arrow = { version = "54.0.0", default-features = false, features = ["prettyprint"] } tpchgen = { path = "../tpchgen", version = "1.1.0" } - +geo = { workspace = true } +geozero = { workspace = true } [dev-dependencies] arrow-csv = "54.0.0" chrono = "0.4.39" diff --git a/tpchgen-arrow/src/building.rs b/tpchgen-arrow/src/building.rs index ab306bf..1814edd 100644 --- a/tpchgen-arrow/src/building.rs +++ b/tpchgen-arrow/src/building.rs @@ -1,7 +1,9 @@ use crate::conversions::string_view_array_from_display_iter; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; -use arrow::array::{Int64Array, RecordBatch, StringViewArray}; +use arrow::array::{BinaryArray, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use geo::Geometry; +use geozero::{CoordDimensions, ToWkb}; use std::sync::{Arc, LazyLock}; use tpchgen::generators::{BuildingGenerator, BuildingGeneratorIterator}; @@ -59,12 +61,17 @@ impl Iterator for BuildingArrow { let buildingkey = Int64Array::from_iter_values(rows.iter().map(|r| r.b_buildingkey)); let name = string_view_array_from_display_iter(rows.iter().map(|r| &r.b_name)); - let polygon_wkt = - StringViewArray::from_iter_values(rows.iter().map(|r| r.b_boundary.clone())); + + // Convert geo::Polygon to WKB binary format + let wkb_array = BinaryArray::from_iter_values(rows.iter().map(|r| { + Geometry::Polygon(r.b_boundary.clone()) + .to_wkb(CoordDimensions::xy()) + .unwrap() + })); let batch = RecordBatch::try_new( Arc::clone(self.schema()), - vec![Arc::new(buildingkey), Arc::new(name), Arc::new(polygon_wkt)], + vec![Arc::new(buildingkey), Arc::new(name), Arc::new(wkb_array)], ) .unwrap(); Some(batch) @@ -77,6 +84,6 @@ fn make_building_schema() -> SchemaRef { Arc::new(Schema::new(vec![ Field::new("b_buildingkey", DataType::Int64, false), Field::new("b_name", DataType::Utf8View, false), - Field::new("b_boundary", DataType::Utf8View, false), + Field::new("b_boundary", DataType::Binary, false), ])) } diff --git a/tpchgen-arrow/src/conversions.rs b/tpchgen-arrow/src/conversions.rs index 46a33ba..b55ef4d 100644 --- a/tpchgen-arrow/src/conversions.rs +++ b/tpchgen-arrow/src/conversions.rs @@ -78,13 +78,13 @@ mod tests { #[test] fn test_to_arrow_date32() { - let value = TPCHDate::new(MIN_GENERATE_DATE, 0, 0); + let value = TPCHDate::new(MIN_GENERATE_DATE, 0, 0, 0); assert_eq!(to_arrow_date32(value), 8035); - let value = TPCHDate::new(MIN_GENERATE_DATE + 100, 0, 0); + let value = TPCHDate::new(MIN_GENERATE_DATE + 100, 0, 0, 0); assert_eq!(to_arrow_date32(value), 8135); - let value = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0); + let value = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0, 0); assert_eq!(to_arrow_date32(value), 9269); } } diff --git a/tpchgen-arrow/src/customer.rs b/tpchgen-arrow/src/customer.rs index 2485c44..4e2e6be 100644 --- a/tpchgen-arrow/src/customer.rs +++ b/tpchgen-arrow/src/customer.rs @@ -26,20 +26,20 @@ use tpchgen::generators::{CustomerGenerator, CustomerGeneratorIterator}; /// .to_string(); /// let lines = formatted_batches.lines().collect::<Vec<_>>(); /// assert_eq!(lines, vec![ -/// "+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+", -/// "| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment |", -/// "+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+", -/// "| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e |", -/// "| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref |", -/// "| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov |", -/// "| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou |", -/// "| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor |", -/// "| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious |", -/// "| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp |", -/// "| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide |", -/// "| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl |", -/// "| 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 5 | 15-741-346-9870 | 2753.54 | HOUSEHOLD | es regular deposits haggle. fur |", -/// "+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+", +/// "+-----------+--------------------+---------------------------------------+-------------+--------------+-----------------+", +/// "| c_custkey | c_name | c_address | c_region | c_nation | c_phone |", +/// "+-----------+--------------------+---------------------------------------+-------------+--------------+-----------------+", +/// "| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | AFRICA | MOROCCO | 25-989-741-2988 |", +/// "| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | MIDDLE EAST | JORDAN | 23-768-687-3665 |", +/// "| 3 | Customer#000000003 | MG9kdTD2WBHm | AMERICA | ARGENTINA | 11-719-748-3364 |", +/// "| 4 | Customer#000000004 | XxVSJsLAGtn | MIDDLE EAST | EGYPT | 14-128-190-5944 |", +/// "| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | AMERICA | CANADA | 13-750-942-6364 |", +/// "| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | MIDDLE EAST | SAUDI ARABIA | 30-114-968-4951 |", +/// "| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | ASIA | CHINA | 28-190-982-9759 |", +/// "| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | AMERICA | PERU | 27-147-574-9335 |", +/// "| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | ASIA | INDIA | 18-338-906-3675 |", +/// "| 10 | Customer#000000010 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | AFRICA | ETHIOPIA | 15-741-346-9870 |", +/// "+-----------+--------------------+---------------------------------------+-------------+--------------+-----------------+" /// ]); /// ``` pub struct CustomerArrow { diff --git a/tpchgen-arrow/src/driver.rs b/tpchgen-arrow/src/driver.rs index a0c2bf3..af760a4 100644 --- a/tpchgen-arrow/src/driver.rs +++ b/tpchgen-arrow/src/driver.rs @@ -26,7 +26,20 @@ use tpchgen::generators::{DriverGenerator, DriverGeneratorIterator}; /// .to_string(); /// let lines = formatted_batches.lines().collect::<Vec<_>>(); /// assert_eq!(lines, vec![ -/// "+-----------+--------------------+-------------------------------------+-------------+-----------------+-----------+-----------------------------------------------------------------------------------------------------+", "| s_suppkey | s_name | s_address | s_nationkey | s_phone | s_acctbal | s_comment |", "+-----------+--------------------+------- [...] +/// "+-------------+------------------+-------------------------------------+-------------+----------------+-----------------+", +/// "| d_driverkey | d_name | d_address | d_region | d_nation | d_phone |", +/// "+-------------+------------------+-------------------------------------+-------------+----------------+-----------------+", +/// "| 1 | Driver#000000001 | N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ | AMERICA | PERU | 27-918-335-1736 |", +/// "| 2 | Driver#000000002 | 89eJ5ksX3ImxJQBvxObC, | AFRICA | ETHIOPIA | 15-679-861-2259 |", +/// "| 3 | Driver#000000003 | q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3 | AMERICA | ARGENTINA | 11-383-516-1199 |", +/// "| 4 | Driver#000000004 | Bk7ah4CK8SYQTepEmvMkkgMwg | AFRICA | MOROCCO | 25-843-787-7479 |", +/// "| 5 | Driver#000000005 | Gcdm2rJRzl5qlTVzc | MIDDLE EAST | IRAQ | 21-151-690-3663 |", +/// "| 6 | Driver#000000006 | tQxuVm7s7CnK | AFRICA | KENYA | 24-696-997-4969 |", +/// "| 7 | Driver#000000007 | s,4TicNGB4uO6PaSqNBUq | EUROPE | UNITED KINGDOM | 33-990-965-2201 |", +/// "| 8 | Driver#000000008 | 9Sq4bBH2FQEmaFOocY45sRTxo6yuoG | AMERICA | PERU | 27-498-742-3860 |", +/// "| 9 | Driver#000000009 | 1KhUgZegwM3ua7dsYmekYBsK | MIDDLE EAST | IRAN | 20-403-398-8662 |", +/// "| 10 | Driver#000000010 | Saygah3gYWMp72i PY | AMERICA | UNITED STATES | 34-852-489-8585 |", +/// "+-------------+------------------+-------------------------------------+-------------+----------------+-----------------+" /// ]); /// ``` pub struct DriverArrow { diff --git a/tpchgen-arrow/src/lib.rs b/tpchgen-arrow/src/lib.rs index 24e045d..aca6367 100644 --- a/tpchgen-arrow/src/lib.rs +++ b/tpchgen-arrow/src/lib.rs @@ -5,7 +5,7 @@ //! files and then parsing them into Arrow. //! //! # Example -//! ``` + // # use tpchgen::generators::LineItemGenerator; // # use tpchgen_arrow::LineItemArrow; // # use arrow::util::pretty::pretty_format_batches; @@ -33,7 +33,7 @@ // "| 3 | 128449 | 3474 | 3 | 27.00 | 39890.88 | 0.06 | 0.07 | A | F | 1994-01-16 | 1993-11-22 | 1994-01-23 | DELIVER IN PERSON | SHIP | nal foxes wake. |", // "+------------+-----------+-----------+--------------+------------+-----------------+------------+-------+--------------+--------------+------------+--------------+---------------+-------------------+------------+-------------------------------------+" // ]); -//! ``` + mod building; pub mod conversions; mod customer; diff --git a/tpchgen-arrow/src/trip.rs b/tpchgen-arrow/src/trip.rs index f3aca1c..a4fc7ce 100644 --- a/tpchgen-arrow/src/trip.rs +++ b/tpchgen-arrow/src/trip.rs @@ -1,7 +1,9 @@ use crate::conversions::{decimal128_array_from_iter, to_arrow_date32}; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; -use arrow::array::{Date32Array, Int64Array, RecordBatch, StringViewArray}; +use arrow::array::{BinaryArray, Date32Array, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use geo::Geometry; +use geozero::{CoordDimensions, ToWkb}; use std::sync::{Arc, LazyLock, Mutex}; use tpchgen::generators::{Trip, TripGenerator, TripGeneratorIterator}; @@ -81,10 +83,16 @@ impl Iterator for TripArrow { let t_tip = decimal128_array_from_iter(rows.iter().map(|row| row.t_tip)); let t_totalamount = decimal128_array_from_iter(rows.iter().map(|row| row.t_totalamount)); let t_distance = decimal128_array_from_iter(rows.iter().map(|row| row.t_distance)); - let t_pickuploc = - StringViewArray::from_iter_values(rows.iter().map(|row| row.t_pickuploc.clone())); - let t_dropoffloc = - StringViewArray::from_iter_values(rows.iter().map(|row| row.t_dropoffloc.clone())); + let t_pickuploc = BinaryArray::from_iter_values(rows.iter().map(|row| { + Geometry::Point(row.t_pickuploc) + .to_wkb(CoordDimensions::xy()) + .expect("Failed to convert pickup location to WKB") + })); + let t_dropoffloc = BinaryArray::from_iter_values(rows.iter().map(|row| { + Geometry::Point(row.t_dropoffloc) + .to_wkb(CoordDimensions::xy()) + .expect("Failed to convert dropoff location to WKB") + })); let batch = RecordBatch::try_new( Arc::clone(&self.schema), @@ -124,7 +132,7 @@ fn make_trip_schema() -> SchemaRef { Field::new("t_tip", DataType::Decimal128(15, 5), false), Field::new("t_totalamount", DataType::Decimal128(15, 5), false), Field::new("t_distance", DataType::Decimal128(15, 5), false), - Field::new("t_pickuploc", DataType::Utf8View, false), - Field::new("t_dropoffloc", DataType::Utf8View, false), + Field::new("t_pickuploc", DataType::Binary, false), + Field::new("t_dropoffloc", DataType::Binary, false), ])) } diff --git a/tpchgen-arrow/src/vehicle.rs b/tpchgen-arrow/src/vehicle.rs index a244d83..bc09cda 100644 --- a/tpchgen-arrow/src/vehicle.rs +++ b/tpchgen-arrow/src/vehicle.rs @@ -26,20 +26,20 @@ use tpchgen::generators::{VehicleGenerator, VehicleGeneratorIterator}; /// .to_string(); /// let lines = formatted_batches.lines().collect::<Vec<_>>(); /// assert_eq!(lines, vec![ -/// "+-----------+------------------------------------------+----------------+----------+-------------------------+--------+-------------+---------------+----------------------+", -/// "| v_vehiclekey | v_name | v_mfgr | v_brand | v_type | v_size | v_container | v_retailprice | v_comment |", -/// "+-----------+------------------------------------------+----------------+----------+-------------------------+--------+-------------+---------------+----------------------+", -/// "| 1 | goldenrod lavender spring chocolate lace | Manufacturer#1 | Brand#13 | PROMO BURNISHED COPPER | 7 | JUMBO PKG | 901.00 | ly. slyly ironi |", -/// "| 2 | blush thistle blue yellow saddle | Manufacturer#1 | Brand#13 | LARGE BRUSHED BRASS | 1 | LG CASE | 902.00 | lar accounts amo |", -/// "| 3 | spring green yellow purple cornsilk | Manufacturer#4 | Brand#42 | STANDARD POLISHED BRASS | 21 | WRAP CASE | 903.00 | egular deposits hag |", -/// "| 4 | cornflower chocolate smoke green pink | Manufacturer#3 | Brand#34 | SMALL PLATED BRASS | 14 | MED DRUM | 904.00 | p furiously r |", -/// "| 5 | forest brown coral puff cream | Manufacturer#3 | Brand#32 | STANDARD POLISHED TIN | 15 | SM PKG | 905.00 | wake carefully |", -/// "| 6 | bisque cornflower lawn forest magenta | Manufacturer#2 | Brand#24 | PROMO PLATED STEEL | 4 | MED BAG | 906.00 | sual a |", -/// "| 7 | moccasin green thistle khaki floral | Manufacturer#1 | Brand#11 | SMALL PLATED COPPER | 45 | SM BAG | 907.00 | lyly. ex |", -/// "| 8 | misty lace thistle snow royal | Manufacturer#4 | Brand#44 | PROMO BURNISHED TIN | 41 | LG DRUM | 908.00 | eposi |", -/// "| 9 | thistle dim navajo dark gainsboro | Manufacturer#4 | Brand#43 | SMALL BURNISHED STEEL | 12 | WRAP CASE | 909.00 | ironic foxe |", -/// "| 10 | linen pink saddle puff powder | Manufacturer#5 | Brand#54 | LARGE BURNISHED STEEL | 44 | LG CAN | 910.01 | ithely final deposit |", -/// "+-----------+------------------------------------------+----------------+----------+-------------------------+--------+-------------+---------------+----------------------+" +/// "+--------------+----------------+----------+-------------------------+----------------------+", +/// "| v_vehiclekey | v_mfgr | v_brand | v_type | v_comment |", +/// "+--------------+----------------+----------+-------------------------+----------------------+", +/// "| 1 | Manufacturer#1 | Brand#13 | PROMO BURNISHED COPPER | ly. slyly ironi |", +/// "| 2 | Manufacturer#1 | Brand#13 | LARGE BRUSHED BRASS | lar accounts amo |", +/// "| 3 | Manufacturer#4 | Brand#42 | STANDARD POLISHED BRASS | egular deposits hag |", +/// "| 4 | Manufacturer#3 | Brand#34 | SMALL PLATED BRASS | p furiously r |", +/// "| 5 | Manufacturer#3 | Brand#32 | STANDARD POLISHED TIN | wake carefully |", +/// "| 6 | Manufacturer#2 | Brand#24 | PROMO PLATED STEEL | sual a |", +/// "| 7 | Manufacturer#1 | Brand#11 | SMALL PLATED COPPER | lyly. ex |", +/// "| 8 | Manufacturer#4 | Brand#44 | PROMO BURNISHED TIN | eposi |", +/// "| 9 | Manufacturer#4 | Brand#43 | SMALL BURNISHED STEEL | ironic foxe |", +/// "| 10 | Manufacturer#5 | Brand#54 | LARGE BURNISHED STEEL | ithely final deposit |", +/// "+--------------+----------------+----------+-------------------------+----------------------+" /// ]); /// ``` pub struct VehicleArrow { diff --git a/tpchgen-arrow/src/zone.rs b/tpchgen-arrow/src/zone.rs index 42ec81b..9b49abf 100644 --- a/tpchgen-arrow/src/zone.rs +++ b/tpchgen-arrow/src/zone.rs @@ -1,7 +1,8 @@ use crate::conversions::string_view_array_from_display_iter; use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; -use arrow::array::{Int64Array, RecordBatch}; +use arrow::array::{BinaryArray, Int64Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use geozero::{CoordDimensions, ToWkb}; use std::sync::{Arc, LazyLock}; use tpchgen::generators::{ZoneGenerator, ZoneGeneratorIterator}; @@ -15,7 +16,7 @@ use tpchgen::generators::{ZoneGenerator, ZoneGeneratorIterator}; /// # use tpchgen_arrow::ZoneArrow; /// /// // Create a SF=1.0 generator and wrap it in an Arrow generator -/// let generator = ZoneGenerator::new(1.0, 1, 1); +/// let generator = ZoneGenerator::new(0.001, 1, 1); /// let mut arrow_generator = ZoneArrow::new(generator) /// .with_batch_size(10); /// // Read the first 10 batches @@ -63,15 +64,25 @@ impl Iterator for ZoneArrow { let z_zonekey = Int64Array::from_iter_values(rows.iter().map(|r| r.z_zonekey)); let z_gersid = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_gersid)); + let z_country = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_country)); + let z_region = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_region)); let z_name = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_name)); let z_subtype = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_subtype)); - let z_boundary = string_view_array_from_display_iter(rows.iter().map(|r| &r.z_boundary)); + + // Convert geo::Polygon to WKB binary format + let z_boundary = BinaryArray::from_iter_values(rows.iter().map(|r| { + r.z_boundary + .to_wkb(CoordDimensions::xy()) + .expect("Failed to encode WKB") + })); let batch = RecordBatch::try_new( Arc::clone(self.schema()), vec![ Arc::new(z_zonekey), Arc::new(z_gersid), + Arc::new(z_country), + Arc::new(z_region), Arc::new(z_name), Arc::new(z_subtype), Arc::new(z_boundary), @@ -88,8 +99,10 @@ fn make_zone_schema() -> SchemaRef { Arc::new(Schema::new(vec![ Field::new("z_zonekey", DataType::Int64, false), Field::new("z_gersid", DataType::Utf8View, false), + Field::new("z_country", DataType::Utf8View, false), + Field::new("z_region", DataType::Utf8View, false), Field::new("z_name", DataType::Utf8View, false), Field::new("z_subtype", DataType::Utf8View, false), - Field::new("z_boundary", DataType::Utf8View, false), + Field::new("z_boundary", DataType::Binary, false), ])) } diff --git a/tpchgen-arrow/tests/reparse.rs b/tpchgen-arrow/tests/reparse.rs index 6d35d4a..ad1ef54 100644 --- a/tpchgen-arrow/tests/reparse.rs +++ b/tpchgen-arrow/tests/reparse.rs @@ -15,6 +15,12 @@ use tpchgen_arrow::{ ZoneArrow, }; +use arrow::array::Array; +use arrow::array::BinaryArray; +use geo::Geometry; +use geozero::ToGeo; +use geozero::wkb::Wkb; + /// Macro that defines tests for tbl for a given type macro_rules! test_row_type { ($FUNCNAME:ident, $GENERATOR:ty, $ARROWITER:ty, $FORMATTYPE:expr) => { @@ -90,58 +96,80 @@ enum Test { } impl Test { - /// Create a test for TBL format fn tbl() -> Self { Self::TBL } - /// Create a test for CSV format fn csv() -> Self { Self::CSV } - /// Generates data using the row iterator and the arrow iterator using the - /// specified format and compares the results of parsing with the Arrow CSV - /// parser with the directly generated the batches fn test<R, RI, RBI>(&self, mut row_iter: RI, mut arrow_iter: RBI) where R: RowType, RI: Iterator<Item = R>, RBI: RecordBatchIterator, { - // For each batch generated by the arrow iterator + let schema = Arc::clone(arrow_iter.schema()); + + // Check for unsupported types for reparsing + let contains_binary = schema + .fields() + .iter() + .any(|f| matches!(f.data_type(), arrow::datatypes::DataType::Binary)); + while let Some(arrow_batch) = arrow_iter.next() { - // Generate the same number of rows using the row iterator - // in the specified format let batch_size = arrow_batch.num_rows(); + + for (i, field) in arrow_batch.schema().fields().iter().enumerate() { + if let arrow::datatypes::DataType::Binary = field.data_type() { + let col = arrow_batch.column(i); + let bin_array = col + .as_any() + .downcast_ref::<arrow::array::BinaryArray>() + .expect("Expected BinaryArray"); + + let expected_geoms = match field.name().as_str() { + "t_pickuploc" | "t_dropoffloc" => &["Point"][..], + "b_boundary" => &["Polygon"][..], + "z_boundary" => &["Polygon", "MultiPolygon"][..], + _ => &["Unknown"][..], + }; + + validate_wkb_column(bin_array, expected_geoms); + } + } + + // Skip reparsing for Binary-containing schemas + if contains_binary { + eprintln!( + "Skipping reparsing test for schema with Binary type: {:?}", + schema + ); + continue; + } + let mut text_data = Vec::new(); self.write_header::<R>(&mut text_data); row_iter.by_ref().take(batch_size).for_each(|item| { self.write_row(item, &mut text_data); }); - // reparse the generated data and compare with the arrow batch - let reparsed_batch = self.parse(&text_data, arrow_iter.schema(), batch_size); + + let reparsed_batch = self.parse(&text_data, &schema, batch_size); assert_eq!(reparsed_batch, arrow_batch); } } - /// Write the header for the row type fn write_header<R: RowType>(&self, text_data: &mut Vec<u8>) { - match self { - Test::TBL => {} - Test::CSV => { - R::write_csv_header(text_data); - } + if let Test::CSV = self { + R::write_csv_header(text_data); } } - /// Write a row into the provided buffer fn write_row<R: RowType>(&self, row: R, text_data: &mut Vec<u8>) { match self { Test::TBL => { row.write_tbl_row(text_data); - // Note: TBL lines end with '|' which the arrow csv parser treats as a - // delimiter for a new column so replace the last '|' with a newline let end_offset = text_data.len() - 1; text_data[end_offset] = b'\n'; } @@ -151,18 +179,15 @@ impl Test { } } - /// Parse the provided data into an Arrow RecordBatch fn parse(&self, data: &[u8], schema: &SchemaRef, batch_size: usize) -> RecordBatch { let builder = arrow_csv::reader::ReaderBuilder::new(Arc::clone(schema)).with_batch_size(batch_size); - let builder = match self { Test::TBL => builder.with_header(false).with_delimiter(b'|'), Test::CSV => builder.with_header(true), }; let mut parser = builder.build(data).unwrap(); - let batch = parser .next() .expect("should have a batch") @@ -171,3 +196,35 @@ impl Test { batch } } + +fn validate_wkb_column(array: &BinaryArray, expected_types: &[&str]) { + for i in 0..array.len() { + if array.is_null(i) { + panic!("Unexpected null geometry at row {i}"); + } + + let bytes = array.value(i); + let geom = Wkb(bytes.to_vec()).to_geo().unwrap_or_else(|err| { + panic!("Row {i}: Failed to parse WKB: {err}"); + }); + + let type_name = match &geom { + Geometry::Point(_) => "Point", + Geometry::Line(_) => "Line", + Geometry::LineString(_) => "LineString", + Geometry::Polygon(_) => "Polygon", + Geometry::MultiPoint(_) => "MultiPoint", + Geometry::MultiLineString(_) => "MultiLineString", + Geometry::MultiPolygon(_) => "MultiPolygon", + Geometry::GeometryCollection(_) => "GeometryCollection", + _ => "Unknown", // Catch Rect, Triangle + }; + + assert!( + expected_types.contains(&type_name), + "Row {i}: Unexpected geometry type: got {}, expected one of {:?}", + type_name, + expected_types + ); + } +} diff --git a/tpchgen-cli/src/main.rs b/tpchgen-cli/src/main.rs index e708789..0539fbc 100644 --- a/tpchgen-cli/src/main.rs +++ b/tpchgen-cli/src/main.rs @@ -289,6 +289,7 @@ impl Cli { Table::Customer, Table::Trip, Table::Building, + Table::Zone, ] }; @@ -399,23 +400,23 @@ impl Cli { // of the table in tbl format let (avg_row_size_bytes, row_count) = match table { Table::Vehicle => ( - 115, + 64, VehicleGenerator::calculate_row_count(self.scale_factor, 1, 1), ), Table::Driver => ( - 140, + 80, DriverGenerator::calculate_row_count(self.scale_factor, 1, 1), ), Table::Customer => ( - 160, + 84, CustomerGenerator::calculate_row_count(self.scale_factor, 1, 1), ), &Table::Trip => ( - 130, + 144, TripGenerator::calculate_row_count(self.scale_factor, 1, 1), ), Table::Building => ( - 115, + 212, BuildingGenerator::calculate_row_count(self.scale_factor, 1, 1), ), Table::Zone => { diff --git a/tpchgen/Cargo.toml b/tpchgen/Cargo.toml index ff3c58a..d20ea7a 100644 --- a/tpchgen/Cargo.toml +++ b/tpchgen/Cargo.toml @@ -14,6 +14,8 @@ license = { workspace = true } [dependencies] rand = { version = "0.8", features = ["small_rng"] } duckdb = { version = "1.3.0", features = ["bundled"] } +geo = { workspace = true } +geozero = { workspace = true } [dev-dependencies] flate2 = "1.1.0" diff --git a/tpchgen/data/sf-0.001/customer.tbl.gz b/tpchgen/data/sf-0.001/customer.tbl.gz deleted file mode 100644 index 5ef8fe5..0000000 Binary files a/tpchgen/data/sf-0.001/customer.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/lineitem.tbl.gz b/tpchgen/data/sf-0.001/lineitem.tbl.gz deleted file mode 100644 index f3b4fcf..0000000 Binary files a/tpchgen/data/sf-0.001/lineitem.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/nation.tbl.gz b/tpchgen/data/sf-0.001/nation.tbl.gz deleted file mode 100644 index 49e9452..0000000 Binary files a/tpchgen/data/sf-0.001/nation.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/orders.tbl.gz b/tpchgen/data/sf-0.001/orders.tbl.gz deleted file mode 100644 index 206da24..0000000 Binary files a/tpchgen/data/sf-0.001/orders.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/part.tbl.gz b/tpchgen/data/sf-0.001/part.tbl.gz deleted file mode 100644 index 00396dd..0000000 Binary files a/tpchgen/data/sf-0.001/part.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/partsupp.tbl.gz b/tpchgen/data/sf-0.001/partsupp.tbl.gz deleted file mode 100644 index 8b38aab..0000000 Binary files a/tpchgen/data/sf-0.001/partsupp.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/region.tbl.gz b/tpchgen/data/sf-0.001/region.tbl.gz deleted file mode 100644 index 6898e52..0000000 Binary files a/tpchgen/data/sf-0.001/region.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.001/supplier.tbl.gz b/tpchgen/data/sf-0.001/supplier.tbl.gz deleted file mode 100644 index f15742a..0000000 Binary files a/tpchgen/data/sf-0.001/supplier.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/customer.tbl.gz b/tpchgen/data/sf-0.01/customer.tbl.gz deleted file mode 100644 index 38383c8..0000000 Binary files a/tpchgen/data/sf-0.01/customer.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/lineitem.tbl.gz b/tpchgen/data/sf-0.01/lineitem.tbl.gz deleted file mode 100644 index f4b1c22..0000000 Binary files a/tpchgen/data/sf-0.01/lineitem.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/nation.tbl.gz b/tpchgen/data/sf-0.01/nation.tbl.gz deleted file mode 100644 index 635686b..0000000 Binary files a/tpchgen/data/sf-0.01/nation.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/orders.tbl.gz b/tpchgen/data/sf-0.01/orders.tbl.gz deleted file mode 100644 index b7cbad1..0000000 Binary files a/tpchgen/data/sf-0.01/orders.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/part.tbl.gz b/tpchgen/data/sf-0.01/part.tbl.gz deleted file mode 100644 index cf742e5..0000000 Binary files a/tpchgen/data/sf-0.01/part.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/partsupp.tbl.gz b/tpchgen/data/sf-0.01/partsupp.tbl.gz deleted file mode 100644 index 65bea8b..0000000 Binary files a/tpchgen/data/sf-0.01/partsupp.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/region.tbl.gz b/tpchgen/data/sf-0.01/region.tbl.gz deleted file mode 100644 index 030f89c..0000000 Binary files a/tpchgen/data/sf-0.01/region.tbl.gz and /dev/null differ diff --git a/tpchgen/data/sf-0.01/supplier.tbl.gz b/tpchgen/data/sf-0.01/supplier.tbl.gz deleted file mode 100644 index d270603..0000000 Binary files a/tpchgen/data/sf-0.01/supplier.tbl.gz and /dev/null differ diff --git a/tpchgen/src/csv.rs b/tpchgen/src/csv.rs index 5521d87..946fdeb 100644 --- a/tpchgen/src/csv.rs +++ b/tpchgen/src/csv.rs @@ -21,10 +21,10 @@ use std::fmt::Display; /// } /// assert_eq!( /// csv, -/// "v_vehiclekey,v_name,v_mfgr,v_brand,v_type,v_size,v_container,v_retailprice,v_comment\n\ -/// 1,goldenrod lavender spring chocolate lace,Manufacturer#1,Brand#13,PROMO BURNISHED COPPER,7,JUMBO PKG,901.00,\"ly. slyly ironi\"\n\ -/// 2,blush thistle blue yellow saddle,Manufacturer#1,Brand#13,LARGE BRUSHED BRASS,1,LG CASE,902.00,\"lar accounts amo\"\n\ -/// 3,spring green yellow purple cornsilk,Manufacturer#4,Brand#42,STANDARD POLISHED BRASS,21,WRAP CASE,903.00,\"egular deposits hag\"\n" +/// "v_vehiclekey,v_mfgr,v_brand,v_type, v_licence\n\ +/// 1,Manufacturer#1,Brand#13,PROMO BURNISHED COPPER,\"ly. slyly ironi\"\n\ +/// 2,Manufacturer#1,Brand#13,LARGE BRUSHED BRASS,\"lar accounts amo\"\n\ +/// 3,Manufacturer#4,Brand#42,STANDARD POLISHED BRASS,\"egular deposits hag\"\n" /// ); /// ``` pub struct VehicleCsv<'a> { @@ -74,10 +74,10 @@ impl Display for VehicleCsv<'_> { /// } /// assert_eq!( /// csv, -/// "s_suppkey,s_name,s_address,s_nationkey,s_phone,s_acctbal,s_comment\n\ -/// 1,Driver#000000001,\" N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ\",17,27-918-335-1736,5755.94,\"each slyly above the careful\"\n\ -/// 2,Driver#000000002,\"89eJ5ksX3ImxJQBvxObC,\",5,15-679-861-2259,4032.68,\" slyly bold instructions. idle dependen\"\n\ -/// 3,Driver#000000003,\"q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3\",1,11-383-516-1199,4192.40,\"blithely silent requests after the express dependencies are sl\"\n" +/// "d_driverkey,d_name,d_address,d_region,d_nation,d_phone\n\ +/// 1,Driver#000000001,\" N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ\",AMERICA,PERU,27-918-335-1736\n\ +/// 2,Driver#000000002,\"89eJ5ksX3ImxJQBvxObC,\",AFRICA,ETHIOPIA,15-679-861-2259\n\ +/// 3,Driver#000000003,\"q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3\",AMERICA,ARGENTINA,11-383-516-1199\n" /// ); /// ``` pub struct DriverCsv { @@ -128,10 +128,10 @@ impl Display for DriverCsv { /// } /// assert_eq!( /// csv, -/// "c_custkey,c_name,c_address,c_nationkey,c_phone,c_acctbal,c_mktsegment,c_comment\n\ -/// 1,Customer#000000001,\"IVhzIApeRb ot,c,E\",15,25-989-741-2988,711.56,BUILDING,\"to the even, regular platelets. regular, ironic epitaphs nag e\"\n\ -/// 2,Customer#000000002,\"XSTf4,NCwDVaWNe6tEgvwfmRchLXak\",13,23-768-687-3665,121.65,AUTOMOBILE,\"l accounts. blithely ironic theodolites integrate boldly: caref\"\n\ -/// 3,Customer#000000003,\"MG9kdTD2WBHm\",1,11-719-748-3364,7498.12,AUTOMOBILE,\" deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov\"\n" +/// "c_custkey,c_name,c_address,c_region,c_nation,c_phone\n\ +/// 1,Customer#000000001,\"IVhzIApeRb ot,c,E\",AFRICA,MOROCCO,25-989-741-2988\n\ +/// 2,Customer#000000002,\"XSTf4,NCwDVaWNe6tEgvwfmRchLXak\",MIDDLE EAST,JORDAN,23-768-687-3665\n\ +/// 3,Customer#000000003,\"MG9kdTD2WBHm\",AMERICA,ARGENTINA,11-719-748-3364\n" /// ); /// ``` pub struct CustomerCsv<'a> { @@ -201,7 +201,7 @@ impl Display for TripCsv { write!( f, // note must quote location and comment fields as they may contain commas - "{},{},{},{},{},{},{},{},{},{},{},{}", + "{},{},{},{},{},{},{},{},{},{},\"{:?}\",\"{:?}\"", self.inner.t_tripkey, self.inner.t_custkey, self.inner.t_driverkey, @@ -254,7 +254,7 @@ impl Display for BuildingCsv<'_> { write!( f, // note must quote the comment field as it may contain commas - "{},{},\"{}\"", + "{},{},\"{:?}\"", self.inner.b_buildingkey, self.inner.b_name, self.inner.b_boundary, ) } @@ -268,7 +268,7 @@ impl Display for BuildingCsv<'_> { /// # use tpchgen::csv::ZoneCsv; /// # use std::fmt::Write; /// // Output the first 3 rows in CSV format -/// let generator = ZoneGenerator::new(1.0, 1, 1); +/// let generator = ZoneGenerator::new(0.001, 1, 1); /// let mut csv = String::new(); /// writeln!(&mut csv, "{}", ZoneCsv::header()).unwrap(); // write header /// for line in generator.iter().take(3) { @@ -287,7 +287,7 @@ impl ZoneCsv { /// Returns the CSV header for the Zone table pub fn header() -> &'static str { - "z_zonekey,z_gersid,z_name,z_subtype,z_boundary" + "z_zonekey,z_gersid,z_country,z_region,z_name,z_subtype,z_boundary" } } @@ -295,12 +295,14 @@ impl Display for ZoneCsv { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{},{},{},{},{}", + "{},{},{},{},{},{},\"{:?}\"", self.inner.z_zonekey, self.inner.z_gersid, + self.inner.z_country, + self.inner.z_region, self.inner.z_name, self.inner.z_subtype, - self.inner.z_boundary + self.inner.z_boundary, ) } } diff --git a/tpchgen/src/dates.rs b/tpchgen/src/dates.rs index 217dd4f..c07b1c5 100644 --- a/tpchgen/src/dates.rs +++ b/tpchgen/src/dates.rs @@ -1,4 +1,6 @@ //! [`TPCHDate`] and date handling +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use std::{ fmt::{Display, Formatter}, sync::LazyLock, @@ -94,6 +96,44 @@ impl GenerateUtils { } } +/// Random time generator that produces hours and minutes +#[derive(Debug, Clone)] +pub struct RandomTimeOfDay { + rng: StdRng, +} + +impl RandomTimeOfDay { + /// Creates a new RandomTimeOfDay generator with the given seed + pub fn new(seed: u64) -> Self { + RandomTimeOfDay { + rng: StdRng::seed_from_u64(seed), + } + } + + /// Generates a random time of day as (hour, minute, second) + pub fn next_value(&mut self) -> (u8, u8, u8) { + let hour = self.rng.gen_range(0..24); + let minute = self.rng.gen_range(0..60); + let second = self.rng.gen_range(0..60); + (hour, minute, second) + } + + /// Advances the generator by a given number of rows + pub fn advance_rows(&mut self, rows: i64) { + for _ in 0..rows { + self.next_value(); + } + } + + /// Mark this row as finished + /// + /// This is a no-op for this generator since it doesn't need row-specific state tracking + /// but is required to match the interface pattern used by other random generators + pub fn row_finished(&mut self) { + // No operation needed - StdRng doesn't require row-based state management + } +} + /// Represents a date (day/year) /// /// Example display: 1992-01-01 @@ -103,26 +143,26 @@ impl GenerateUtils { /// # Example /// ``` /// # use tpchgen::dates::{TPCHDate, MIN_GENERATE_DATE}; -/// let date = TPCHDate::new(MIN_GENERATE_DATE + 41, 0, 0); +/// let date = TPCHDate::new(MIN_GENERATE_DATE + 41, 0, 0, 0); /// // Convert the date to y/m/d fields /// assert_eq!((92,2,11), date.to_ymd()); /// // format as a string using the Display impl -/// assert_eq!("1992-02-11", date.to_string()); +/// assert_eq!("1992-02-11 00:00:00", date.to_string()); /// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)] pub struct TPCHDate { - /// date index (0 based) from MIN_GENERATE_DATE date_index: i32, hour: u8, // 0-23 minute: u8, // 0-59 + second: u8, } impl Display for TPCHDate { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, - "{} {:02}:{:02}", - &DATE_TO_STRING[self.date_index as usize], self.hour, self.minute + "{} {:02}:{:02}:{:02}", + &DATE_TO_STRING[self.date_index as usize], self.hour, self.minute, self.second ) } } @@ -138,34 +178,18 @@ impl TPCHDate { pub const UNIX_EPOCH_OFFSET: i32 = 8035; /// Create a new TPCHDate from a generated date - pub fn new(generated_date: i32, hour: u8, minute: u8) -> Self { + pub fn new(generated_date: i32, hour: u8, minute: u8, second: u8) -> Self { Self { date_index: generated_date - MIN_GENERATE_DATE, hour, minute, + second, } } - // pub fn from_ymdhm(generated_date: i32, hour: u8, minute: u8) -> Self { - // Self { - // date_index: generated_date - MIN_GENERATE_DATE, - // hour, - // minute, - // } - // } - - // Example: add minutes to the datetime - pub fn add_minutes(&self, minutes: i32) -> Self { - let total_minutes = self.hour as i32 * 60 + self.minute as i32 + minutes; - let days_added = total_minutes.div_euclid(1440); - let new_minutes = total_minutes.rem_euclid(1440); - let new_hour = (new_minutes / 60) as u8; - let new_minute = (new_minutes % 60) as u8; - Self { - date_index: self.date_index + days_added, - hour: new_hour, - minute: new_minute, - } + /// Create a new date with a given day value and time components including seconds + pub fn new_with_time(day_value: i32, (hour, minute, second): (u8, u8, u8)) -> Self { + TPCHDate::new(day_value, hour, minute, second) } /// Returns the (year, month, day) of this date @@ -275,34 +299,37 @@ mod test { use super::*; #[test] fn test_date_strings() { - let date = TPCHDate::new(MIN_GENERATE_DATE + 1, 0, 0); - assert_eq!(date.to_string(), "1992-01-02 00:00"); + let date = TPCHDate::new(MIN_GENERATE_DATE + 1, 0, 0, 0); + assert_eq!(date.to_string(), "1992-01-02 00:00:00"); - let date = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0); - assert_eq!(date.to_string(), "1995-05-19 00:00"); + let date = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0, 0); + assert_eq!(date.to_string(), "1995-05-19 00:00:00"); - let date = TPCHDate::new(MIN_GENERATE_DATE + TOTAL_DATE_RANGE - 1, 0, 0); - assert_eq!(date.to_string(), "1998-12-31 00:00"); + let date = TPCHDate::new(MIN_GENERATE_DATE + TOTAL_DATE_RANGE - 1, 0, 0, 0); + assert_eq!(date.to_string(), "1998-12-31 00:00:00"); } - // #[test] - // fn test_display_dates() { - // for index in [1, 23, 321, 623, 1234, 2345, 2556] { - // let date = TPCHDate::new(MIN_GENERATE_DATE + index, 0, 0); - // let (y, m, dy) = date.to_ymd(); - // assert_eq!(format_ymd(y, m, dy), date.to_string()); - // } - // } + #[test] + fn test_display_dates() { + for index in [1, 23, 321, 623, 1234, 2345, 2556] { + let date = TPCHDate::new(MIN_GENERATE_DATE + index, 10, 30, 45); + let (y, m, dy) = date.to_ymd(); + assert_eq!( + format!("{} 10:30:45", format_ymd(y, m, dy)), + date.to_string() + ); + } + } #[test] fn test_date_epoch_consistency() { // Check that dates are actually machine some epochs. - let date = TPCHDate::new(MIN_GENERATE_DATE + 1, 0, 0); + let date = TPCHDate::new(MIN_GENERATE_DATE + 1, 0, 0, 0); assert_eq!(date.to_unix_epoch(), 8036); - let date = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0); + let date = TPCHDate::new(MIN_GENERATE_DATE + 1234, 0, 0, 0); // 1995-05-19 00:00:00 (12:00:00 AM) - assert_eq!(date.to_string(), "1995-05-19 00:00"); + assert_eq!(date.to_string(), "1995-05-19 00:00:00"); assert_eq!(date.to_unix_epoch(), 9269); } } diff --git a/tpchgen/src/generators.rs b/tpchgen/src/generators.rs index 04c3b17..ce66b1c 100644 --- a/tpchgen/src/generators.rs +++ b/tpchgen/src/generators.rs @@ -13,8 +13,12 @@ use crate::spider::{spider_seed_for_index, SpiderGenerator}; use crate::spider_presets::SpiderPresets; use crate::text::TextPool; use duckdb::Connection; +use geo::Geometry; +use geo::Point; +use geozero::{wkb::Wkb, ToGeo}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; +use std::convert::TryInto; use std::fmt; use std::fmt::Display; @@ -109,7 +113,7 @@ impl<'a> VehicleGenerator<'a> { /// Creates a new VehicleGenerator with the given scale factor /// - /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for + /// Note the generator's lifetime is `&'static`. See [`VehicleGenerator`] for /// more details. pub fn new(scale_factor: f64, part: i32, part_count: i32) -> VehicleGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` @@ -384,7 +388,7 @@ impl<'a> DriverGenerator<'a> { /// Creates a new DriverGenerator with the given scale factor /// - /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for + /// Note the generator's lifetime is `&'static`. See [`DriverGenerator`] for /// more details. pub fn new(scale_factor: f64, part: i32, part_count: i32) -> DriverGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` @@ -658,7 +662,7 @@ impl<'a> CustomerGenerator<'a> { /// Creates a new CustomerGenerator with the given scale factor /// - /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for + /// Note the generator's lifetime is `&'static`. See [`CustomerGenerator`] for /// more details. pub fn new(scale_factor: f64, part: i32, part_count: i32) -> CustomerGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` @@ -844,16 +848,16 @@ pub struct Trip { /// Trip distance pub t_distance: TPCHDecimal, /// Trip pickup coordinates - pub t_pickuploc: String, + pub t_pickuploc: Point, /// Trip dropoff coordinates - pub t_dropoffloc: String, + pub t_dropoffloc: Point, } impl Display for Trip { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", + "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{:?}|{:?}|", self.t_tripkey, self.t_custkey, self.t_driverkey, @@ -973,8 +977,7 @@ pub struct TripGeneratorIterator { driver_key_random: RandomBoundedLong, vehicle_key_random: RandomBoundedLong, pickup_date_random: RandomBoundedInt, - hour_random: RandomBoundedInt, - minute_random: RandomBoundedInt, + pickup_time_random: dates::RandomTimeOfDay, fare_per_mile_random: RandomBoundedInt, tip_percent_random: RandomBoundedInt, trip_minutes_per_mile_random: RandomBoundedInt, @@ -1017,8 +1020,7 @@ impl TripGeneratorIterator { dates::MIN_GENERATE_DATE, dates::MIN_GENERATE_DATE + dates::TOTAL_DATE_RANGE - 1, ); - let mut hour_random = RandomBoundedInt::new(123456789, 0, 23); - let mut minute_random = RandomBoundedInt::new(987654321, 0, 59); + let mut pickup_time_random = dates::RandomTimeOfDay::new(123456789); let mut fare_per_mile_random = RandomBoundedInt::new( 109837462, @@ -1040,8 +1042,7 @@ impl TripGeneratorIterator { driver_key_random.advance_rows(start_index); vehicle_key_random.advance_rows(start_index); pickup_date_random.advance_rows(start_index); - hour_random.advance_rows(start_index); - minute_random.advance_rows(start_index); + pickup_time_random.advance_rows(start_index); fare_per_mile_random.advance_rows(start_index); tip_percent_random.advance_rows(start_index); trip_minutes_per_mile_random.advance_rows(start_index); @@ -1051,8 +1052,7 @@ impl TripGeneratorIterator { driver_key_random, vehicle_key_random, pickup_date_random, - hour_random, - minute_random, + pickup_time_random, fare_per_mile_random, tip_percent_random, trip_minutes_per_mile_random, @@ -1088,28 +1088,20 @@ impl TripGeneratorIterator { ); let pickup_date_value = self.pickup_date_random.next_value(); - - // After (with random hour/minute as example): - let hour = self.hour_random.next_value(); - let minute = self.minute_random.next_value(); - let pickup_date = TPCHDate::new(pickup_date_value, hour as u8, minute as u8); + let pickup_time = self.pickup_time_random.next_value(); + let pickup_date = TPCHDate::new_with_time(pickup_date_value, pickup_time); // Get distance from KDE model (in miles with decimal precision) let distance_value = self.distance_kde.generate(trip_key as u64); let distance = TPCHDecimal((distance_value * 100.0) as i64); // Pickup - let pickuploc = self.spatial_gen.generate(trip_key as u64); - - // Extract just the coordinates part by removing "POINT (" and ")" - let coords_str = pickuploc - .trim_start_matches("POINT (") - .trim_end_matches(")"); - let coords: Vec<&str> = coords_str.split_whitespace().collect(); - - // Parse the coordinates directly - let pickup_x = coords[0].parse::<f64>().unwrap(); - let pickup_y = coords[1].parse::<f64>().unwrap(); + let pickuploc_geom = self.spatial_gen.generate(trip_key as u64); + let pickuploc: Point = pickuploc_geom + .try_into() + .expect("Failed to convert to point"); + let pickup_x = pickuploc.x(); + let pickup_y = pickuploc.y(); // Angle let angle_seed = spider_seed_for_index(trip_key as u64, 1234); @@ -1119,9 +1111,8 @@ impl TripGeneratorIterator { // Dropoff via polar projection let dropoff_x = pickup_x + distance_value * angle.cos(); let dropoff_y = pickup_y + distance_value * angle.sin(); - let dropoffloc = format!("POINT ({} {})", dropoff_x, dropoff_y); + let dropoffloc = Point::new(dropoff_x, dropoff_y); - // Fix multiplication of f64 by integers by using f64 literals let fare_per_mile = self.fare_per_mile_random.next_value() as f64; let fare_value = (distance_value * fare_per_mile) / 100.0; let fare = TPCHDecimal((fare_value * 100.0) as i64); // Use 100.0 (float) instead of 100 (int) @@ -1134,15 +1125,21 @@ impl TripGeneratorIterator { let total = TPCHDecimal((total_value * 100.0) as i64); // Use 100.0 instead of 100 // Calculate trip duration based on distance - let minutes_per_mile = 3000; - let distance_miles = distance_value; - let duration_minutes = (distance_miles * minutes_per_mile as f64).round() as i32; - - let total_minutes = hour * 60 + minute + duration_minutes; - let dropoff_hour = (total_minutes / 60) % 24; - let dropoff_minute = total_minutes % 60; - let day_delta = total_minutes / (24 * 60); + let seconds_per_degree = 180000; + let duration_seconds = (distance_value * seconds_per_degree as f64).round() as i32; + + // Get hours and minutes from pickup time + let (pickup_hour, pickup_minute, pickup_second) = pickup_time; + let total_seconds = (pickup_hour as i32) * 3600 + + (pickup_minute as i32) * 60 + + (pickup_second as i32) + + duration_seconds; + let dropoff_hour = ((total_seconds / 3600) % 24) as u8; + let dropoff_minute = ((total_seconds % 3600) / 60) as u8; + let dropoff_second = (total_seconds % 60) as u8; + let day_delta = total_seconds / (24 * 3600); let dropoff_day = pickup_date_value + day_delta; + // Ensure the dropoff day doesn't exceed the maximum date value let bounded_dropoff_day = std::cmp::min( dropoff_day, @@ -1150,8 +1147,9 @@ impl TripGeneratorIterator { ); let dropoff_date = TPCHDate::new( bounded_dropoff_day, - dropoff_hour as u8, - dropoff_minute as u8, + dropoff_hour, + dropoff_minute, + dropoff_second, ); Trip { @@ -1186,6 +1184,7 @@ impl Iterator for TripGeneratorIterator { self.driver_key_random.row_finished(); self.vehicle_key_random.row_finished(); self.pickup_date_random.row_finished(); + self.pickup_time_random.row_finished(); self.fare_per_mile_random.row_finished(); self.tip_percent_random.row_finished(); self.trip_minutes_per_mile_random.row_finished(); @@ -1204,14 +1203,14 @@ pub struct Building<'a> { /// Name of the building pub b_name: StringSequenceInstance<'a>, /// WKT representation of the building's polygon - pub b_boundary: String, + pub b_boundary: geo::Polygon, } impl Display for Building<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{}|{}|{}|", + "{}|{}|{:?}|", self.b_buildingkey, self.b_name, self.b_boundary, ) } @@ -1234,9 +1233,9 @@ impl<'a> BuildingGenerator<'a> { const NAME_WORDS: i32 = 1; const COMMENT_AVERAGE_LENGTH: i32 = 14; - /// Creates a new VehicleGenerator with the given scale factor + /// Creates a new BuildingGenerator with the given scale factor /// - /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for + /// Note the generator's lifetime is `&'static`. See [`BuildingGenerator`] for /// more details. pub fn new(scale_factor: f64, part: i32, part_count: i32) -> BuildingGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` @@ -1352,12 +1351,13 @@ impl<'a> BuildingGeneratorIterator<'a> { /// Creates a part with the given key fn make_building(&mut self, building_key: i64) -> Building<'a> { let name = self.name_random.next_value(); - let wkt = self.spatial_gen.generate(building_key as u64); + let geom = self.spatial_gen.generate(building_key as u64); + let polygon: geo::Polygon = geom.try_into().expect("Failed to convert to polygon"); Building { b_buildingkey: building_key, b_name: name, - b_boundary: wkt, + b_boundary: polygon, } } } @@ -1387,20 +1387,30 @@ pub struct Zone { pub z_zonekey: i64, /// GERS ID of the zone pub z_gersid: String, + /// Country of the zone + pub z_country: String, + /// Region of the zone + pub z_region: String, /// Name of the zone pub z_name: String, /// Subtype of the zone pub z_subtype: String, /// Boundary geometry in WKT format - pub z_boundary: String, + pub z_boundary: Geometry, } impl Display for Zone { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{}|{}|{}|{}|{}|", - self.z_zonekey, self.z_gersid, self.z_name, self.z_subtype, self.z_boundary + "{}|{}|{}|{}|{}|{}|{:?}|", + self.z_zonekey, + self.z_gersid, + self.z_country, + self.z_region, + self.z_name, + self.z_subtype, + self.z_boundary ) } } @@ -1408,12 +1418,14 @@ impl Display for Zone { /// Generator for [`Zone`]s that loads from a parquet file in S3 #[derive(Debug, Clone)] pub struct ZoneGenerator { + scale_factor: f64, zones: Vec<Zone>, part: i32, part_count: i32, } impl ZoneGenerator { + const SCALE_BASE: i32 = 867_102; /// S3 URL for the zones parquet file const OVERTURE_RELEASE_DATE: &'static str = "2025-06-25.0"; const OVERTURE_S3_BUCKET: &'static str = "overturemaps-us-west-2"; @@ -1428,22 +1440,25 @@ impl ZoneGenerator { Self::OVERTURE_RELEASE_DATE ) } - // (OVERTURE_RELEASE_DATE,"s3://overturemaps-us-west-2/release/2025-06-25.0/theme=divisions/type=division_area/*"); /// Creates a new ZoneGenerator that loads data from S3 - pub fn new(_scale_factor: f64, part: i32, part_count: i32) -> ZoneGenerator { - // Load zones from parquet file in S3 - let zones = Self::load_zones_from_s3(); - - ZoneGenerator { - zones, + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> ZoneGenerator { + // construct temporary ZoneGenerator with empty zones + let mut generator = ZoneGenerator { + scale_factor, part, part_count, - } + zones: Vec::new(), + }; + + let zones = generator.load_zones_from_s3(); + generator.zones = zones; + + generator } /// Loads zone data from S3 parquet file using DuckDB - fn load_zones_from_s3() -> Vec<Zone> { + fn load_zones_from_s3(&self) -> Vec<Zone> { // Create a connection to DuckDB let conn = Connection::open_in_memory().expect("Failed to open DuckDB connection"); @@ -1457,41 +1472,45 @@ impl ZoneGenerator { conn.execute("LOAD spatial;", []) .expect("Failed to load spatial"); - // Set S3 region - conn.execute("SET s3_region='us-west-2';", []) - .expect("Failed to set S3 region"); - - // Query the parquet file directly - Cast the division_id to BIGINT - let mut stmt = conn - .prepare( - "SELECT - id as z_gersid, - COALESCE(names.primary, '') as z_name, - subtype as z_subtype, - ST_AsText(geometry) as z_boundary - FROM read_parquet(?1, hive_partitioning=1) - WHERE subtype IN ('county', 'locality', 'neighbourhood')", - ) - .expect("Failed to prepare query"); - let zones_url = Self::get_zones_parquet_url(); + + // Compute the limit based on scale factor + let limit = (self.scale_factor * Self::SCALE_BASE as f64).ceil() as i64; + + let query = format!( + "SELECT + id as z_gersid, + country as z_country, + COALESCE(region, '') as z_region, + COALESCE(names.primary, '') as z_name, + subtype as z_subtype, + ST_AsWKB(geometry) as z_boundary + FROM read_parquet('{}', hive_partitioning=1) + WHERE subtype IN ('localadmin', 'locality', 'neighborhood') + LIMIT {};", + zones_url, limit + ); + + let mut stmt = conn.prepare(&query).unwrap(); + let mut rows = stmt.query([]).unwrap(); + let mut zones = Vec::new(); // Counter for primary key let mut zone_id = 1; - let mut rows = stmt.query([&zones_url]).expect("Failed to execute query"); while let Ok(Some(row)) = rows.next() { - // Read the row values - let zone = Zone { - z_zonekey: zone_id, - z_gersid: row.get(0).expect("Failed to read gers_id"), - z_name: row.get(1).expect("Failed to read name"), - z_subtype: row.get(2).expect("Failed to read subtype"), - z_boundary: row.get(3).expect("Failed to read wkt"), - }; - - zones.push(zone); + let wkb_bytes: Vec<u8> = row.get(5).unwrap(); + let geometry: Geometry = Wkb(&wkb_bytes).to_geo().unwrap(); + zones.push(Zone { + z_zonekey: zone_id, + z_gersid: row.get(0).unwrap(), + z_country: row.get(1).unwrap(), + z_region: row.get(2).unwrap(), + z_name: row.get(3).unwrap(), + z_subtype: row.get(4).unwrap(), + z_boundary: geometry, + }); zone_id += 1; } @@ -1600,7 +1619,6 @@ impl Iterator for ZoneGeneratorIterator { #[cfg(test)] mod tests { use super::*; - #[test] fn test_vehicle_generation() { // Create a generator with a small scale factor @@ -1643,7 +1661,7 @@ mod tests { let generator = CustomerGenerator::new(0.01, 1, 1); let customers: Vec<_> = generator.iter().collect(); - // Should have 0.01 * 150,000 = 1,500 customers + // Should have 0.01 * 30,000 = 300 customers assert_eq!(customers.len(), 300); // Check first customer @@ -1667,14 +1685,14 @@ mod tests { ); assert_eq!(first.to_string(), expected_pattern); } - + #[test] fn test_trip_generation() { // Create a generator with a small scale factor let generator = TripGenerator::new(0.01, 1, 1); let trips: Vec<_> = generator.iter().collect(); - // Should have 0.01 * 1,000,000 = 10,000 trips + // Should have 0.01 * 6,000,000 = 60,000 trips assert_eq!(trips.len(), 60_000); // Check first trip @@ -1685,18 +1703,11 @@ mod tests { assert!(first.t_vehiclekey > 0); // Check that pickup date is before or equal to dropoff date - // TPCHDate doesn't have a .0 field, use date comparison instead - // assert!(first.t_pickuptime <= first.t_dropofftime); - - // Check that the financial values make sense - // assert!(first.t_fare.0 > 0); - // assert!(first.t_tip.0 >= 0); // Tip could be zero - // assert_eq!(first.t_totalamount.0, first.t_fare.0 + first.t_tip.0); - // assert!(first.t_distance.0 > 0); + assert!(first.t_pickuptime <= first.t_dropofftime); // Verify the string format matches the expected pattern let expected_pattern = format!( - "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", + "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{:?}|{:?}|", first.t_tripkey, first.t_custkey, first.t_driverkey, @@ -1715,7 +1726,7 @@ mod tests { // Check first Trip let first = &trips[1]; assert_eq!(first.t_tripkey, 2); - assert_eq!(first.to_string(), "2|172|1|1|1997-12-24 22:50|1997-12-24 23:32|0.03|0.00|0.04|0.01|POINT (-123.30659706835938 33.6437762421875)|POINT (-123.29286225833363 33.64593281462752)|"); + assert_eq!(first.to_string(), "2|172|1|1|1997-12-24 08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(-168.046875 -21.09375)|POINT(-168.03314018997426 -21.091593427559978)|"); } #[test] @@ -1733,7 +1744,7 @@ mod tests { // Verify the string format matches the expected pattern let expected_pattern = format!( - "{}|{}|{}|", + "{}|{}|{:?}|", first.b_buildingkey, first.b_name, first.b_boundary, ); assert_eq!(first.to_string(), expected_pattern); @@ -1741,23 +1752,23 @@ mod tests { // Check first Building let first = &buildings[1]; assert_eq!(first.b_buildingkey, 2); - assert_eq!(first.to_string(), "2|blush|POLYGON ((-102.2154579691 40.5193652499, -102.2133112848 40.5193652499, -102.2133112848 40.5207006446, -102.2154579691 40.5207006446, -102.2154579691 40.5193652499))|") + assert_eq!(first.to_string(), "2|blush|POLYGON((-37.962323825156744 28.065637750265665,-37.94908364554638 28.065637750265665,-37.94908364554638 28.075185613992147,-37.962323825156744 28.075185613992147,-37.962323825156744 28.065637750265665))|") } #[test] fn test_zone_generation() { // Create a generator with a small scale factor - let generator = ZoneGenerator::new(0.1, 1, 1); + let generator = ZoneGenerator::new(0.001, 1, 1); let zones: Vec<_> = generator.into_iter().collect(); - assert_eq!(zones.len(), 596124); + assert_eq!(zones.len(), 868); // Check first Driver let first = &zones[0]; assert_eq!(first.z_zonekey, 1); assert_eq!( first.to_string(), - "1|54bea793-2dc6-47b0-a4c1-5b96f17e66a3|Chatham Islands Territory|county|MULTIPOLYGON (((-176.2418754 -44.4327352, -176.2396744 -44.4349882, -176.2379244 -44.4330281, -176.2384204 -44.4312342, -176.2418754 -44.4327352)), ((-176.165218 -44.3563138, -176.1650533 -44.3413916, -176.1773808 -44.3358569, -176.18558 -44.3493409, -176.165218 -44.3563138)), ((-176.2463812 -44.3292996, -176.25687 -44.3447818, -176.2382722 -44.3507201, -176.2271372 -44.334208, -176.2025537 -44.3268945, [...] + "1|b40981d8-1a8b-4b30-bbdc-2a2d941bfa4f|PF||Anapoto|locality|POLYGON((-152.8059003 -22.6387783,-152.8063121 -22.6353325,-152.8063274 -22.6352309,-152.8064935 -22.6352445,-152.806615 -22.6352496,-152.8068727 -22.6352603,-152.8070173 -22.6352663,-152.8072428 -22.6352461,-152.8073888 -22.6352422,-152.8075809 -22.6352564,-152.8076508 -22.6352615,-152.8080525 -22.6353115,-152.8082102 -22.6353388,-152.8083864 -22.6353691,-152.8087408 -22.635439,-152.8089964 -22.6354851,-152.809157 [...] ) } } diff --git a/tpchgen/src/lib.rs b/tpchgen/src/lib.rs index 2c6194e..262de8e 100644 --- a/tpchgen/src/lib.rs +++ b/tpchgen/src/lib.rs @@ -6,30 +6,30 @@ //! //! # Example: TBL output format //! ``` -//! # use tpchgen::generators::LineItemGenerator; -//! // Create Generator for the LINEITEM table at Scale Factor 1 (SF 1) +//! # use tpchgen::generators::TripGenerator; +//! // Create Generator for the TRIP table at Scale Factor 1 (SF 1) //! let scale_factor = 1.0; //! let part = 1; //! let num_parts = 1; -//! let generator = LineItemGenerator::new(scale_factor, part, num_parts); +//! let generator = TripGenerator::new(scale_factor, part, num_parts); //! //! // Output the first 3 rows in classic TPCH TBL format //! // (the generators are normal rust iterators and combine well with the Rust ecosystem) -//! let lines: Vec<_> = generator.iter() +//! let trips: Vec<_> = generator.iter() //! .take(3) -//! .map(|line| line.to_string()) // use Display impl to get TBL format +//! .map(|trips| trips.to_string()) // use Display impl to get TBL format //! .collect::<Vec<_>>(); //! assert_eq!( -//! lines.join("\n"),"\ -//! 1|155190|7706|1|17|21168.23|0.04|0.02|N|O|1996-03-13|1996-02-12|1996-03-22|DELIVER IN PERSON|TRUCK|egular courts above the|\n\ -//! 1|67310|7311|2|36|45983.16|0.09|0.06|N|O|1996-04-12|1996-02-28|1996-04-20|TAKE BACK RETURN|MAIL|ly final dependencies: slyly bold |\n\ -//! 1|63700|3701|3|8|13309.60|0.10|0.02|N|O|1996-01-29|1996-03-05|1996-01-31|TAKE BACK RETURN|REG AIR|riously. regular, express dep|" +//! trips.join("\n"),"\ +//! 1|21425|47|46|1997-07-24 06:58:22|1997-07-24 13:59:54|0.34|0.02|0.37|0.14|POINT(-172.96875 -90.0)|POINT(-172.98257407932567 -90.13982815963308)|\n\ +//! 2|17012|66|65|1997-12-24 08:47:14|1997-12-24 09:28:57|0.03|0.00|0.04|0.01|POINT(-168.046875 -21.09375)|POINT(-168.03314018997426 -21.091593427559978)|\n\ +//! 3|4454|68|67|1993-06-27 13:27:07|1993-06-27 13:34:51|0.00|0.00|0.00|0.00|POINT(-171.5625 -78.3984375)|POINT(-171.56429290849482 -78.40028771516948)|" //! ); //! ``` //! //! The TPC-H dataset is composed of several tables with foreign key relations //! between them. For each table we implement and expose a generator that uses -//! the iterator API to produce structs e.g [`LineItem`] that represent a single +//! the iterator API to produce structs e.g [`Trip`] that represent a single //! row. //! //! For each struct type we expose several facilities that allow fast conversion @@ -38,10 +38,10 @@ //! This crate currently supports the following output formats: //! //! - TBL: The `Display` impl of the row structs produces the TPCH TBL format. -//! - CSV: the [`csv`] module has formatters for CSV output (e.g. [`LineItemCsv`]). +//! - CSV: the [`csv`] module has formatters for CSV output (e.g. [`TripCsv`]). //! -//! [`LineItem`]: generators::LineItem -//! [`LineItemCsv`]: csv::LineItemCsv +//! [`Trip`]: generators::Trip +//! [`TripCsv`]: csv::TripCsv //! //! //! The library was designed to be easily integrated in existing Rust projects as diff --git a/tpchgen/src/spider.rs b/tpchgen/src/spider.rs index 890c4cd..46331cc 100644 --- a/tpchgen/src/spider.rs +++ b/tpchgen/src/spider.rs @@ -1,3 +1,4 @@ +use geo::{coord, Geometry, LineString, Point, Polygon}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::f64::consts::PI; @@ -57,7 +58,7 @@ impl SpiderGenerator { Self { config } } - pub fn generate(&self, index: u64) -> String { + pub fn generate(&self, index: u64) -> Geometry { let seed = spider_seed_for_index(index, self.config.seed as u64); let mut rng = StdRng::seed_from_u64(seed); @@ -70,26 +71,26 @@ impl SpiderGenerator { } } - fn generate_uniform(&self, rng: &mut StdRng) -> String { + fn generate_uniform(&self, rng: &mut StdRng) -> Geometry { let x = rand_unit(rng); let y = rand_unit(rng); match self.config.geom_type { - GeomType::Point => generate_point_wkt((x, y), &self.config), - GeomType::Box => generate_box_wkt((x, y), &self.config, rng), - GeomType::Polygon => generate_polygon_wkt((x, y), &self.config, rng), + GeomType::Point => generate_point_geom((x, y), &self.config), + GeomType::Box => generate_box_geom((x, y), &self.config, rng), + GeomType::Polygon => generate_polygon_geom((x, y), &self.config, rng), } } - fn generate_normal(&self, rng: &mut StdRng) -> String { + fn generate_normal(&self, rng: &mut StdRng) -> Geometry { match self.config.params { DistributionParams::Normal { mu, sigma } => { let x = rand_normal(rng, mu, sigma).clamp(0.0, 1.0); let y = rand_normal(rng, mu, sigma).clamp(0.0, 1.0); match self.config.geom_type { - GeomType::Point => generate_point_wkt((x, y), &self.config), - GeomType::Box => generate_box_wkt((x, y), &self.config, rng), - GeomType::Polygon => generate_polygon_wkt((x, y), &self.config, rng), + GeomType::Point => generate_point_geom((x, y), &self.config), + GeomType::Box => generate_box_geom((x, y), &self.config, rng), + GeomType::Polygon => generate_polygon_geom((x, y), &self.config, rng), } } _ => panic!( @@ -99,7 +100,7 @@ impl SpiderGenerator { } } - fn generate_diagonal(&self, rng: &mut StdRng) -> String { + fn generate_diagonal(&self, rng: &mut StdRng) -> Geometry { match self.config.params { DistributionParams::Diagonal { percentage, buffer } => { let (x, y) = if rng.gen::<f64>() < percentage { @@ -114,9 +115,9 @@ impl SpiderGenerator { }; match self.config.geom_type { - GeomType::Point => generate_point_wkt((x, y), &self.config), - GeomType::Box => generate_box_wkt((x, y), &self.config, rng), - GeomType::Polygon => generate_polygon_wkt((x, y), &self.config, rng), + GeomType::Point => generate_point_geom((x, y), &self.config), + GeomType::Box => generate_box_geom((x, y), &self.config, rng), + GeomType::Polygon => generate_polygon_geom((x, y), &self.config, rng), } } _ => panic!( @@ -126,7 +127,7 @@ impl SpiderGenerator { } } - fn generate_bit(&self, rng: &mut StdRng) -> String { + fn generate_bit(&self, rng: &mut StdRng) -> Geometry { match self.config.params { DistributionParams::Bit { probability, @@ -136,9 +137,9 @@ impl SpiderGenerator { let y = spider_bit(rng, probability, digits); match self.config.geom_type { - GeomType::Point => generate_point_wkt((x, y), &self.config), - GeomType::Box => generate_box_wkt((x, y), &self.config, rng), - GeomType::Polygon => generate_polygon_wkt((x, y), &self.config, rng), + GeomType::Point => generate_point_geom((x, y), &self.config), + GeomType::Box => generate_box_geom((x, y), &self.config, rng), + GeomType::Polygon => generate_polygon_geom((x, y), &self.config, rng), } } _ => panic!( @@ -148,7 +149,7 @@ impl SpiderGenerator { } } - fn generate_sierpinski(&self, rng: &mut StdRng) -> String { + fn generate_sierpinski(&self, rng: &mut StdRng) -> Geometry { let (mut x, mut y) = (0.0, 0.0); let a = (0.0, 0.0); let b = (1.0, 0.0); @@ -171,9 +172,9 @@ impl SpiderGenerator { } match self.config.geom_type { - GeomType::Point => generate_point_wkt((x, y), &self.config), - GeomType::Box => generate_box_wkt((x, y), &self.config, rng), - GeomType::Polygon => generate_polygon_wkt((x, y), &self.config, rng), + GeomType::Point => generate_point_geom((x, y), &self.config), + GeomType::Box => generate_box_geom((x, y), &self.config, rng), + GeomType::Polygon => generate_polygon_geom((x, y), &self.config, rng), } } } @@ -218,6 +219,13 @@ fn spider_bit(rng: &mut StdRng, prob: f64, digits: u32) -> f64 { .sum() } +pub fn generate_point_geom(center: (f64, f64), config: &SpiderConfig) -> Geometry { + let (x, y) = config + .affine + .map_or(center, |aff| apply_affine(center.0, center.1, &aff)); + Geometry::Point(Point::new(x, y)) +} + pub fn generate_point_wkt(center: (f64, f64), config: &SpiderConfig) -> String { let (x, y) = if let Some(aff) = &config.affine { apply_affine(center.0, center.1, aff) @@ -227,6 +235,27 @@ pub fn generate_point_wkt(center: (f64, f64), config: &SpiderConfig) -> String { format!("POINT ({} {})", x, y) } +pub fn generate_box_geom(center: (f64, f64), config: &SpiderConfig, rng: &mut StdRng) -> Geometry { + let half_width = rand_unit(rng) * config.width / 2.0; + let half_height = rand_unit(rng) * config.height / 2.0; + + let corners = [ + (center.0 - half_width, center.1 - half_height), + (center.0 + half_width, center.1 - half_height), + (center.0 + half_width, center.1 + half_height), + (center.0 - half_width, center.1 + half_height), + (center.0 - half_width, center.1 - half_height), + ]; + + let coords: Vec<_> = corners + .iter() + .map(|&(x, y)| config.affine.map_or((x, y), |aff| apply_affine(x, y, &aff))) + .map(|(x, y)| coord! { x: x, y: y }) + .collect(); + + Geometry::Polygon(Polygon::new(LineString::from(coords), vec![])) +} + pub fn generate_box_wkt(center: (f64, f64), config: &SpiderConfig, rng: &mut StdRng) -> String { let half_width = rand_unit(rng) * config.width / 2.0; let half_height = rand_unit(rng) * config.height / 2.0; @@ -254,6 +283,40 @@ pub fn generate_box_wkt(center: (f64, f64), config: &SpiderConfig, rng: &mut Std format!("POLYGON (({}))", coords.join(", ")) } +pub fn generate_polygon_geom( + center: (f64, f64), + config: &SpiderConfig, + rng: &mut StdRng, +) -> Geometry { + let min_segs = 3; + let num_segments = if config.maxseg <= 3 { + 3 + } else { + rng.gen_range(0..=(config.maxseg - min_segs)) + min_segs + }; + + let mut angles: Vec<f64> = (0..num_segments) + .map(|_| rand_unit(rng) * 2.0 * PI) + .collect(); + angles.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let mut coords = angles + .iter() + .map(|angle| { + let (x, y) = ( + center.0 + config.polysize * angle.cos(), + center.1 + config.polysize * angle.sin(), + ); + config.affine.map_or((x, y), |aff| apply_affine(x, y, &aff)) + }) + .map(|(x, y)| coord! { x: x, y: y }) + .collect::<Vec<_>>(); + + coords.push(coords[0]); // close the ring + + Geometry::Polygon(Polygon::new(LineString::from(coords), vec![])) +} + pub fn generate_polygon_wkt(center: (f64, f64), config: &SpiderConfig, rng: &mut StdRng) -> String { let min_segs = 3; let num_segments = if config.maxseg <= 3 { diff --git a/tpchgen/src/spider_presets.rs b/tpchgen/src/spider_presets.rs index 11148b8..0ab4077 100644 --- a/tpchgen/src/spider_presets.rs +++ b/tpchgen/src/spider_presets.rs @@ -5,20 +5,19 @@ use crate::spider::{ pub struct SpiderPresets; impl SpiderPresets { + const FULL_WORLD_AFFINE: [f64; 6] = [ + 360.0, // Scale X to cover full longitude range (-180° to 180°) + 0.0, -180.0, // Offset X to start at -180° (west edge of map) + 0.0, 180.0, // Scale Y to cover full latitude range (-90° to 90°) + -90.0, // Offset Y to start at -90° (south edge of map) + ]; pub fn for_trip_pickups() -> SpiderGenerator { let config = SpiderConfig { dist_type: DistributionType::Uniform, geom_type: GeomType::Point, dim: 2, seed: 42, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.0, @@ -39,14 +38,7 @@ impl SpiderPresets { geom_type: GeomType::Point, dim: 2, seed: 42, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.0, @@ -70,14 +62,7 @@ impl SpiderPresets { geom_type: GeomType::Point, dim: 2, seed: 42, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.0, @@ -98,14 +83,7 @@ impl SpiderPresets { geom_type: GeomType::Point, dim: 2, seed: 42, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.0, @@ -129,14 +107,7 @@ impl SpiderPresets { geom_type: GeomType::Point, dim: 2, seed: 42, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.0, @@ -160,14 +131,7 @@ impl SpiderPresets { geom_type: GeomType::Box, dim: 2, seed: 12345, - affine: Some([ - 58.368269, - 0.0, - -125.244606, // scale X to 58.37°, offset to -125.24° - 0.0, - 25.175375, - 24.006328, // scale Y to 25.18°, offset to 24.00° - ]), + affine: Some(Self::FULL_WORLD_AFFINE), // geometry = box width: 0.00005, diff --git a/tpchgen/tests/integration_tests.rs b/tpchgen/tests/integration_tests.rs index 4f25fea..58c4ea3 100644 --- a/tpchgen/tests/integration_tests.rs +++ b/tpchgen/tests/integration_tests.rs @@ -4,9 +4,6 @@ use flate2::read::GzDecoder; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::{Path, PathBuf}; -// use tpchgen::generators::{ -// CustomerGenerator, DriverGenerator, VehicleGenerator, -// }; fn read_tbl_gz<P: AsRef<Path>>(path: P) -> Vec<String> { let file = File::open(path).expect("Failed to open file");
