This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit fbdeafdd0cb0b355145b47bc0d85c566055ff044 Author: Pranav Toggi <[email protected]> AuthorDate: Wed Jun 25 13:13:32 2025 -0700 add skeleton for Building --- tpchgen-arrow/src/building.rs | 85 +++++++++ tpchgen-arrow/src/lib.rs | 2 + tpchgen-arrow/tests/reparse.rs | 14 +- tpchgen-cli/src/csv.rs | 5 +- tpchgen-cli/src/main.rs | 27 ++- tpchgen-cli/src/tbl.rs | 3 +- tpchgen/Cargo.toml | 1 - tpchgen/src/csv.rs | 46 ++++- tpchgen/src/generators.rs | 397 +++++++++++++++++++++++++++++++---------- 9 files changed, 469 insertions(+), 111 deletions(-) diff --git a/tpchgen-arrow/src/building.rs b/tpchgen-arrow/src/building.rs new file mode 100644 index 0000000..34a6ccb --- /dev/null +++ b/tpchgen-arrow/src/building.rs @@ -0,0 +1,85 @@ +use crate::conversions::string_view_array_from_display_iter; +use crate::{DEFAULT_BATCH_SIZE, RecordBatchIterator}; +use arrow::array::{Int64Array, RecordBatch, StringViewArray}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use std::sync::{Arc, LazyLock}; +use tpchgen::generators::{BuildingGenerator, BuildingGeneratorIterator}; + +/// Generate [`Building`]s in [`RecordBatch`] format +/// +/// [`Building`]: tpchgen::generators::Building +/// +/// # Example +/// ``` +/// # use tpchgen::generators::{BuildingGenerator}; +/// # use tpchgen_arrow::BuildingArrow; +/// +/// // Create a SF=1.0 generator and wrap it in an Arrow generator +/// let generator = BuildingGenerator::new(1.0, 1, 1); +/// let mut arrow_generator = BuildingArrow::new(generator) +/// .with_batch_size(10); +/// // Read the first batch +/// let batch = arrow_generator.next().unwrap(); +/// ``` +pub struct BuildingArrow { + inner: BuildingGeneratorIterator<'static>, + batch_size: usize, +} + +impl BuildingArrow { + pub fn new(generator: BuildingGenerator<'static>) -> Self { + Self { + inner: generator.iter(), + batch_size: DEFAULT_BATCH_SIZE, + } + } + + /// Set the batch size + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } +} + +impl RecordBatchIterator for BuildingArrow { + fn schema(&self) -> &SchemaRef { + &BUILDING_SCHEMA + } +} + +impl Iterator for BuildingArrow { + type Item = RecordBatch; + + fn next(&mut self) -> Option<Self::Item> { + // Get next rows to convert + let rows: Vec<_> = self.inner.by_ref().take(self.batch_size).collect(); + if rows.is_empty() { + return None; + } + + let buildingkey = Int64Array::from_iter_values(rows.iter().map(|r| r.b_buildingkey)); + let name = string_view_array_from_display_iter(rows.iter().map(|r| &r.b_name)); + let polygon_wkt = StringViewArray::from_iter_values(rows.iter().map(|r| r.b_polygonwkt)); + + let batch = RecordBatch::try_new( + Arc::clone(self.schema()), + vec![ + Arc::new(buildingkey), + Arc::new(name), + Arc::new(polygon_wkt), + ], + ) + .unwrap(); + Some(batch) + } +} + +/// Schema for the Building +static BUILDING_SCHEMA: LazyLock<SchemaRef> = LazyLock::new(make_building_schema); +fn make_building_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("b_buildingkey", DataType::Int64, false), + Field::new("b_name", DataType::Utf8View, false), + Field::new("b_polygonwkt", DataType::Utf8View, false), + ])) +} \ No newline at end of file diff --git a/tpchgen-arrow/src/lib.rs b/tpchgen-arrow/src/lib.rs index 151fdf6..f729803 100644 --- a/tpchgen-arrow/src/lib.rs +++ b/tpchgen-arrow/src/lib.rs @@ -43,6 +43,7 @@ mod vehicle; mod region; mod driver; mod trip; +mod building; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; @@ -54,6 +55,7 @@ pub use vehicle::VehicleArrow; pub use region::RegionArrow; pub use driver::DriverArrow; pub use trip::TripArrow; +pub use building::BuildingArrow; /// Iterator of Arrow [`RecordBatch`] that also knows its schema pub trait RecordBatchIterator: Iterator<Item = RecordBatch> + Send { diff --git a/tpchgen-arrow/tests/reparse.rs b/tpchgen-arrow/tests/reparse.rs index acd0ae7..045a684 100644 --- a/tpchgen-arrow/tests/reparse.rs +++ b/tpchgen-arrow/tests/reparse.rs @@ -6,16 +6,16 @@ use arrow::datatypes::SchemaRef; use std::io::Write; use std::sync::Arc; use tpchgen::csv::{ - CustomerCsv, LineItemCsv, NationCsv, OrderCsv, VehicleCsv, RegionCsv, DriverCsv, + CustomerCsv, LineItemCsv, NationCsv, OrderCsv, VehicleCsv, RegionCsv, DriverCsv, TripCsv, BuildingCsv }; use tpchgen::generators::{ Customer, CustomerGenerator, LineItem, LineItemGenerator, Nation, NationGenerator, Order, OrderGenerator, Vehicle, VehicleGenerator, Region, RegionGenerator, - Driver, DriverGenerator, + Driver, DriverGenerator, TripGenerator, BuildingGenerator }; use tpchgen_arrow::{ CustomerArrow, LineItemArrow, NationArrow, OrderArrow, VehicleArrow, - RecordBatchIterator, RegionArrow, DriverArrow, + RecordBatchIterator, RegionArrow, DriverArrow, TripArrow, BuildingArrow }; /// Macro that defines tests for tbl for a given type @@ -44,12 +44,16 @@ test_row_type!(nation_tbl, NationGenerator, NationArrow, Test::tbl()); test_row_type!(nation_csv, NationGenerator, NationArrow, Test::csv()); test_row_type!(order_tbl, OrderGenerator, OrderArrow, Test::tbl()); test_row_type!(order_csv, OrderGenerator, OrderArrow, Test::csv()); -test_row_type!(part_tbl, VehicleGenerator, VehicleArrow, Test::tbl()); -test_row_type!(part_csv, VehicleGenerator, VehicleArrow, Test::csv()); +test_row_type!(vehicle_tbl, VehicleGenerator, VehicleArrow, Test::tbl()); +test_row_type!(vehicle_csv, VehicleGenerator, VehicleArrow, Test::csv()); test_row_type!(region_tbl, RegionGenerator, RegionArrow, Test::tbl()); test_row_type!(region_csv, RegionGenerator, RegionArrow, Test::csv()); test_row_type!(driver_tbl, DriverGenerator, DriverArrow, Test::tbl()); test_row_type!(driver_csv, DriverGenerator, DriverArrow, Test::csv()); +// test_row_type!(trip_tbl, TripGenerator, TripArrow, Test::tbl()); +// test_row_type!(trip_csv, TripGenerator, TripArrow, Test::csv()); +// test_row_type!(building_tbl, BuildingGenerator, BuildingArrow, Test::tbl()); +// test_row_type!(building_csv, BuildingGenerator, BuildingArrow, Test::csv()); /// Common trait for writing rows in TBL and CSV format trait RowType { diff --git a/tpchgen-cli/src/csv.rs b/tpchgen-cli/src/csv.rs index 9d01ed0..f58bff7 100644 --- a/tpchgen-cli/src/csv.rs +++ b/tpchgen-cli/src/csv.rs @@ -2,10 +2,10 @@ use super::generate::Source; use std::io::Write; use tpchgen::csv::{ - CustomerCsv, LineItemCsv, NationCsv, OrderCsv, VehicleCsv, RegionCsv, DriverCsv, TripCsv + CustomerCsv, LineItemCsv, NationCsv, OrderCsv, VehicleCsv, RegionCsv, DriverCsv, TripCsv, BuildingCsv }; use tpchgen::generators::{ - CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator, + CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator, BuildingGenerator, }; /// Define a Source that writes the table in CSV format @@ -49,3 +49,4 @@ define_csv_source!(CustomerCsvSource, CustomerGenerator<'static>, CustomerCsv); define_csv_source!(OrderCsvSource, OrderGenerator<'static>, OrderCsv); define_csv_source!(LineItemCsvSource, LineItemGenerator<'static>, LineItemCsv); define_csv_source!(TripCsvSource, TripGenerator, TripCsv); +define_csv_source!(BuildingCsvSource, BuildingGenerator<'static>, BuildingCsv); diff --git a/tpchgen-cli/src/main.rs b/tpchgen-cli/src/main.rs index d518d6f..fa2449a 100644 --- a/tpchgen-cli/src/main.rs +++ b/tpchgen-cli/src/main.rs @@ -62,12 +62,12 @@ use std::str::FromStr; use std::time::Instant; use tpchgen::distribution::Distributions; use tpchgen::generators::{ - CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator, + CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator, BuildingGenerator, }; use tpchgen::text::TextPool; use tpchgen_arrow::{ CustomerArrow, LineItemArrow, NationArrow, OrderArrow, VehicleArrow, - RecordBatchIterator, RegionArrow, DriverArrow, TripArrow + RecordBatchIterator, RegionArrow, DriverArrow, TripArrow, BuildingArrow, }; #[derive(Parser)] @@ -138,6 +138,7 @@ enum Table { Orders, Lineitem, Trip, + Building, } impl Display for Table { @@ -179,6 +180,7 @@ impl TypedValueParser for TableValueParser { clap::builder::PossibleValue::new("orders").help("Orders table (alias: O)"), clap::builder::PossibleValue::new("lineitem").help("LineItem table (alias: L)"), clap::builder::PossibleValue::new("trip").help("Trip table (alias: T)"), + clap::builder::PossibleValue::new("building").help("Trip table (alias: b)"), ] .into_iter(), )) @@ -204,6 +206,7 @@ impl FromStr for Table { "O" | "orders" => Ok(Table::Orders), "L" | "lineitem" => Ok(Table::Lineitem), "T" | "trip" => Ok(Table::Trip), + "b" | "building" => Ok(Table::Building), _ => Err("Invalid table name {s}"), } } @@ -219,7 +222,8 @@ impl Table { Table::Customer => "customer", Table::Orders => "orders", Table::Lineitem => "lineitem", - Table::Trip => "Trip", + Table::Trip => "trip", + Table::Building => "building", } } } @@ -316,12 +320,13 @@ impl Cli { match table { Table::Nation => self.generate_nation().await?, Table::Region => self.generate_region().await?, - Table::Vehicle => self.generate_part().await?, + Table::Vehicle => self.generate_vehicle().await?, Table::Driver => self.generate_driver().await?, Table::Customer => self.generate_customer().await?, Table::Orders => self.generate_orders().await?, Table::Lineitem => self.generate_lineitem().await?, Table::Trip => self.generate_trip().await?, + Table::Building => self.generate_building().await?, } } @@ -346,7 +351,7 @@ impl Cli { RegionArrow ); define_generate!( - generate_part, + generate_vehicle, Table::Vehicle, VehicleGenerator, VehicleTblSource, @@ -393,6 +398,14 @@ impl Cli { TripCsvSource, TripArrow ); + define_generate!( + generate_building, + Table::Building, + BuildingGenerator, + BuildingTblSource, + BuildingCsvSource, + BuildingArrow + ); /// return the output filename for the given table fn output_filename(&self, table: Table) -> String { @@ -454,6 +467,10 @@ impl Cli { (128, row_count) }, &Table::Trip => (130, TripGenerator::calculate_row_count(self.scale_factor, 1, 1)), + Table::Building => ( + 115, + BuildingGenerator::calculate_row_count(self.scale_factor, 1, 1), + ), }; // target chunks of about 16MB (use 15MB to ensure we don't exceed the target size) let target_chunk_size_bytes = 15 * 1024 * 1024; diff --git a/tpchgen-cli/src/tbl.rs b/tpchgen-cli/src/tbl.rs index 441a128..4ea9158 100644 --- a/tpchgen-cli/src/tbl.rs +++ b/tpchgen-cli/src/tbl.rs @@ -2,7 +2,7 @@ use super::generate::Source; use std::io::Write; -use tpchgen::generators::{CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator}; +use tpchgen::generators::{CustomerGenerator, LineItemGenerator, NationGenerator, OrderGenerator, VehicleGenerator, RegionGenerator, DriverGenerator, TripGenerator, BuildingGenerator}; /// Define a Source that writes the table in TBL format macro_rules! define_tbl_source { @@ -43,3 +43,4 @@ define_tbl_source!(CustomerTblSource, CustomerGenerator<'static>); define_tbl_source!(OrderTblSource, OrderGenerator<'static>); define_tbl_source!(LineItemTblSource, LineItemGenerator<'static>); define_tbl_source!(TripTblSource, TripGenerator); +define_tbl_source!(BuildingTblSource, BuildingGenerator<'static>); diff --git a/tpchgen/Cargo.toml b/tpchgen/Cargo.toml index dc10c10..7ed89f8 100644 --- a/tpchgen/Cargo.toml +++ b/tpchgen/Cargo.toml @@ -13,7 +13,6 @@ license = { workspace = true } # See ../ARCHITECTURE.md for more details [dependencies] rand = { version = "0.8", features = ["small_rng"] } -rand_distr = "0.4.3" [dev-dependencies] flate2 = "1.1.0" diff --git a/tpchgen/src/csv.rs b/tpchgen/src/csv.rs index 9331f9b..d0fbea5 100644 --- a/tpchgen/src/csv.rs +++ b/tpchgen/src/csv.rs @@ -1,6 +1,6 @@ //! CSV formatting support for the row struct objects generated by the library. -use crate::generators::{Customer, LineItem, Nation, Order, Vehicle, Region, Driver, Trip}; +use crate::generators::{Customer, LineItem, Nation, Order, Vehicle, Region, Driver, Trip, Building}; use core::fmt; use std::fmt::Display; @@ -437,3 +437,47 @@ impl Display for TripCsv { ) } } + +/// Write [`Building`]s in CSV format. +/// +/// # Example +/// ``` +/// # use tpchgen::generators::BuildingGenerator; +/// # use tpchgen::csv::BuildingCsv; +/// # use std::fmt::Write; +/// // Output the first 3 rows in CSV format +/// let generator = BuildingGenerator::new(1.0, 1, 1); +/// let mut csv = String::new(); +/// writeln!(&mut csv, "{}", BuildingCsv::header()).unwrap(); // write header +/// for line in generator.iter().take(3) { +/// // write line using CSV formatter +/// writeln!(&mut csv, "{}", BuildingCsv::new(line)).unwrap(); +/// } +/// ``` +pub struct BuildingCsv<'a> { + inner: Building<'a>, +} + +impl<'a> BuildingCsv<'a> { + pub fn new(inner: Building<'a>) -> Self { + Self { inner } + } + + /// Returns the CSV header for the Building table + pub fn header() -> &'static str { + "b_buildingkey,b_name,b_polygonwkt" + } +} + +impl Display for BuildingCsv<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + // note must quote the comment field as it may contain commas + "{},{},\"{}\"", + self.inner.b_buildingkey, + self.inner.b_name, + self.inner.b_polygonwkt, + ) + } +} \ No newline at end of file diff --git a/tpchgen/src/generators.rs b/tpchgen/src/generators.rs index 694ac88..7d27dd8 100644 --- a/tpchgen/src/generators.rs +++ b/tpchgen/src/generators.rs @@ -5,7 +5,7 @@ use crate::distribution::Distribution; use crate::distribution::Distributions; use crate::random::RandomPhoneNumber; use crate::random::RowRandomInt; -use crate::random::{PhoneNumberInstance, RandomBoundedLong}; +use crate::random::{PhoneNumberInstance, RandomBoundedLong, StringSequenceInstance}; use crate::random::{RandomAlphaNumeric, RandomAlphaNumericInstance}; use crate::text::TextPool; use core::fmt; @@ -33,7 +33,7 @@ impl Default for NationGenerator<'_> { impl<'a> NationGenerator<'a> { /// Creates a new NationGenerator with default distributions and text pool /// - /// Nations does not depend on the scale factor or the vehicle number. The signature of + /// Nations does not depend on the scale factor or the part number. The signature of /// this method is provided to be consistent with the other generators, but the /// parameters are ignored. You can use [`NationGenerator::default`] to create a /// default generator. @@ -224,7 +224,7 @@ impl Default for RegionGenerator<'_> { impl<'a> RegionGenerator<'a> { /// Creates a new RegionGenerator with default distributions and text pool /// - /// Regions does not depend on the scale factor or the vehicle number. The signature of + /// Regions does not depend on the scale factor or the part number. The signature of /// this method is provided to be consistent with the other generators, but the /// parameters are ignored. You can use [`RegionGenerator::default`] to create a /// default generator. @@ -383,8 +383,8 @@ impl fmt::Display for Vehicle<'_> { #[derive(Debug, Clone)] pub struct VehicleGenerator<'a> { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, } @@ -407,12 +407,12 @@ impl<'a> VehicleGenerator<'a> { /// /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for /// more details. - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> VehicleGenerator<'static> { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> VehicleGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), ) @@ -421,26 +421,26 @@ impl<'a> VehicleGenerator<'a> { /// Creates a VehicleGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, ) -> VehicleGenerator<'b> { VehicleGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions, text_pool, } } - /// Return the row count for the given scale factor and generator vehicle count - pub fn calculate_row_count(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> i64 { - GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, vehicle, vehicle_count) + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) } - /// Returns an iterator over the vehicle rows + /// Returns an iterator over the part rows pub fn iter(&self) -> VehicleGeneratorIterator<'a> { VehicleGeneratorIterator::new( self.distributions, @@ -448,10 +448,10 @@ impl<'a> VehicleGenerator<'a> { GenerateUtils::calculate_start_index( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), - Self::calculate_row_count(self.scale_factor, self.vehicle, self.vehicle_count), + Self::calculate_row_count(self.scale_factor, self.part, self.part_count), ) } } @@ -644,8 +644,8 @@ impl fmt::Display for Driver { #[derive(Debug, Clone)] pub struct DriverGenerator<'a> { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, } @@ -676,12 +676,12 @@ impl<'a> DriverGenerator<'a> { /// /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for /// more details. - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> DriverGenerator<'static> { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> DriverGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), ) @@ -690,23 +690,23 @@ impl<'a> DriverGenerator<'a> { /// Creates a DriverGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, ) -> DriverGenerator<'b> { DriverGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions, text_pool, } } - /// Return the row count for the given scale factor and generator vehicle count - pub fn calculate_row_count(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> i64 { - GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, vehicle, vehicle_count) + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) } /// Returns an iterator over the Driver rows @@ -717,10 +717,10 @@ impl<'a> DriverGenerator<'a> { GenerateUtils::calculate_start_index( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), - Self::calculate_row_count(self.scale_factor, self.vehicle, self.vehicle_count), + Self::calculate_row_count(self.scale_factor, self.part, self.part_count), ) } } @@ -933,8 +933,8 @@ impl fmt::Display for Customer<'_> { #[derive(Debug, Clone)] pub struct CustomerGenerator<'a> { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, } @@ -953,12 +953,12 @@ impl<'a> CustomerGenerator<'a> { /// /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for /// more details. - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> CustomerGenerator<'static> { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> CustomerGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), ) @@ -967,23 +967,23 @@ impl<'a> CustomerGenerator<'a> { /// Creates a CustomerGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, ) -> CustomerGenerator<'b> { CustomerGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions, text_pool, } } - /// Return the row count for the given scale factor and generator vehicle count - pub fn calculate_row_count(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> i64 { - GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, vehicle, vehicle_count) + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) } /// Returns an iterator over the customer rows @@ -994,10 +994,10 @@ impl<'a> CustomerGenerator<'a> { GenerateUtils::calculate_start_index( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), - Self::calculate_row_count(self.scale_factor, self.vehicle, self.vehicle_count), + Self::calculate_row_count(self.scale_factor, self.part, self.part_count), ) } } @@ -1202,8 +1202,8 @@ impl fmt::Display for Order<'_> { #[derive(Debug, Clone)] pub struct OrderGenerator<'a> { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, } @@ -1230,12 +1230,12 @@ impl<'a> OrderGenerator<'a> { /// /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for /// more details. - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> OrderGenerator<'static> { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> OrderGenerator<'static> { // Note: use explicit lifetime to ensure this remains `&'static` Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), ) @@ -1244,23 +1244,23 @@ impl<'a> OrderGenerator<'a> { /// Creates a OrderGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, ) -> OrderGenerator<'b> { OrderGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions, text_pool, } } - /// Return the row count for the given scale factor and generator vehicle count - pub fn calculate_row_count(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> i64 { - GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, vehicle, vehicle_count) + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) } /// Returns an iterator over the order rows @@ -1272,10 +1272,10 @@ impl<'a> OrderGenerator<'a> { GenerateUtils::calculate_start_index( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), - Self::calculate_row_count(self.scale_factor, self.vehicle, self.vehicle_count), + Self::calculate_row_count(self.scale_factor, self.part, self.part_count), ) } @@ -1571,8 +1571,8 @@ impl fmt::Display for LineItem<'_> { #[derive(Debug, Clone)] pub struct LineItemGenerator<'a> { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'a Distributions, text_pool: &'a TextPool, } @@ -1601,11 +1601,11 @@ impl<'a> LineItemGenerator<'a> { /// /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for /// more details. - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> LineItemGenerator<'static> { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> LineItemGenerator<'static> { Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), ) @@ -1614,15 +1614,15 @@ impl<'a> LineItemGenerator<'a> { /// Creates a LineItemGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, ) -> LineItemGenerator<'b> { LineItemGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions, text_pool, } @@ -1637,14 +1637,14 @@ impl<'a> LineItemGenerator<'a> { GenerateUtils::calculate_start_index( OrderGenerator::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), GenerateUtils::calculate_row_count( OrderGenerator::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), ) } @@ -2032,8 +2032,8 @@ impl fmt::Display for Trip { #[derive(Debug, Clone)] pub struct TripGenerator { scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: Distributions, text_pool: TextPool, distance_kde: crate::kde::DistanceKDE, @@ -2053,11 +2053,11 @@ impl TripGenerator { const TRIP_DURATION_MAX_PER_MILE: i32 = 3; // max 3 minutes per mile /// Creates a new TripGenerator with the given scale factor - pub fn new(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> TripGenerator { + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> TripGenerator { Self::new_with_distributions_and_text_pool( scale_factor, - vehicle, - vehicle_count, + part, + part_count, Distributions::static_default(), TextPool::get_or_init_default(), crate::kde::default_distance_kde(), @@ -2068,8 +2068,8 @@ impl TripGenerator { /// Creates a TripGenerator with specified distributions and text pool pub fn new_with_distributions_and_text_pool<'b>( scale_factor: f64, - vehicle: i32, - vehicle_count: i32, + part: i32, + part_count: i32, distributions: &'b Distributions, text_pool: &'b TextPool, distance_kde: crate::kde::DistanceKDE, @@ -2077,8 +2077,8 @@ impl TripGenerator { ) -> TripGenerator { TripGenerator { scale_factor, - vehicle, - vehicle_count, + part, + part_count, distributions: distributions.clone(), text_pool: text_pool.clone(), distance_kde, @@ -2086,9 +2086,9 @@ impl TripGenerator { } } - /// Return the row count for the given scale factor and generator vehicle count - pub fn calculate_row_count(scale_factor: f64, vehicle: i32, vehicle_count: i32) -> i64 { - GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, vehicle, vehicle_count) + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) } /// Returns an iterator over the trip rows @@ -2100,14 +2100,14 @@ impl TripGenerator { GenerateUtils::calculate_start_index( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), GenerateUtils::calculate_row_count( Self::SCALE_BASE, self.scale_factor, - self.vehicle, - self.vehicle_count, + self.part, + self.part_count, ), self.distance_kde.clone(), // Add the KDE model self.spatial_gen.clone(), @@ -2319,6 +2319,179 @@ impl<'a> Iterator for TripGeneratorIterator { } } +/// Represents a building in the dataset +#[derive(Debug, Clone, PartialEq)] +pub struct Building<'a> { + /// Unique identifier for the building + pub b_buildingkey: i64, + /// Name of the building + pub b_name: StringSequenceInstance<'a>, + /// WKT representation of the building's polygon + pub b_polygonwkt: &'a str, +} + +impl Display for Building<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}|{}|{}|", + self.b_buildingkey, + self.b_name, + self.b_polygonwkt, + ) + } +} + +/// Generator for [`Building`]s +pub struct BuildingGenerator<'a> { + scale_factor: f64, + part: i32, + part_count: i32, + distributions: &'a Distributions, + text_pool: &'a TextPool, +} + +impl<'a> BuildingGenerator<'a> { + /// Base scale for vehicle generation + const SCALE_BASE: i32 = 20_000; + const NAME_WORDS: i32 = 1; + const COMMENT_AVERAGE_LENGTH: i32 = 14; + + /// Creates a new VehicleGenerator with the given scale factor + /// + /// Note the generator's lifetime is `&'static`. See [`NationGenerator`] for + /// more details. + pub fn new(scale_factor: f64, part: i32, part_count: i32) -> BuildingGenerator<'static> { + // Note: use explicit lifetime to ensure this remains `&'static` + Self::new_with_distributions_and_text_pool( + scale_factor, + part, + part_count, + Distributions::static_default(), + TextPool::get_or_init_default(), + ) + } + + /// Creates a BuildingGenerator with specified distributions and text pool + pub fn new_with_distributions_and_text_pool<'b>( + scale_factor: f64, + part: i32, + part_count: i32, + distributions: &'b Distributions, + text_pool: &'b TextPool, + ) -> BuildingGenerator<'b> { + BuildingGenerator { + scale_factor, + part, + part_count, + distributions, + text_pool, + } + } + + /// Return the row count for the given scale factor and generator part count + pub fn calculate_row_count(scale_factor: f64, part: i32, part_count: i32) -> i64 { + GenerateUtils::calculate_row_count(Self::SCALE_BASE, scale_factor, part, part_count) + } + + /// Returns an iterator over the part rows + pub fn iter(&self) -> BuildingGeneratorIterator<'a> { + BuildingGeneratorIterator::new( + self.distributions, + self.text_pool, + GenerateUtils::calculate_start_index( + Self::SCALE_BASE, + self.scale_factor, + self.part, + self.part_count, + ), + Self::calculate_row_count(self.scale_factor, self.part, self.part_count), + ) + } +} + +impl<'a> IntoIterator for &'a BuildingGenerator<'a> { + type Item = Building<'a>; + type IntoIter = BuildingGeneratorIterator<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Iterator that generates Building rows +#[derive(Debug)] +pub struct BuildingGeneratorIterator<'a> { + name_random: RandomStringSequence<'a>, + wkt_random: RandomText<'a>, + + start_index: i64, + row_count: i64, + index: i64, +} + +impl<'a> BuildingGeneratorIterator<'a> { + fn new( + distributions: &'a Distributions, + text_pool: &'a TextPool, + start_index: i64, + row_count: i64, + ) -> Self { + let mut name_random = RandomStringSequence::new( + 709314158, + BuildingGenerator::NAME_WORDS, + distributions.part_colors(), + ); + let mut wkt_random = RandomText::new( + 804159733, + text_pool, + BuildingGenerator::COMMENT_AVERAGE_LENGTH as f64, + ); + + // Advance all generators to the starting position + name_random.advance_rows(start_index); + wkt_random.advance_rows(start_index); + + BuildingGeneratorIterator { + name_random, + wkt_random, + start_index, + row_count, + index: 0, + } + } + + /// Creates a part with the given key + fn make_building(&mut self, building_key: i64) -> Building<'a> { + let name = self.name_random.next_value(); + + Building { + b_buildingkey: building_key, + b_name: name, + b_polygonwkt: self.wkt_random.next_value(), + } + } +} + +impl<'a> Iterator for BuildingGeneratorIterator<'a> { + type Item = Building<'a>; + + fn next(&mut self) -> Option<Self::Item> { + if self.index >= self.row_count { + return None; + } + + let building = self.make_building(self.start_index + self.index + 1); + + self.name_random.row_finished(); + self.wkt_random.row_finished(); + + self.index += 1; + + Some(building) + } +} + #[cfg(test)] mod tests { use super::*; @@ -2471,7 +2644,7 @@ mod tests { // Verify the string format matches the expected pattern let expected_pattern = format!( - "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", + "{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|", first.t_tripkey, first.t_custkey, first.t_driverkey, @@ -2481,14 +2654,46 @@ mod tests { first.t_fare, first.t_tip, first.t_totalamount, - first.t_distance + first.t_distance, + first.t_pickupx, + first.t_pickupy, + first.t_dropoffx, + first.t_dropoffy, ); assert_eq!(first.to_string(), expected_pattern); // Check first Trip let first = &trips[1]; assert_eq!(first.t_tripkey, 2); - assert_eq!(first.to_string(), "2|851|1286|1285|1997-12-24|1997-12-24|37.00|6.00|43.00|1.40|") + assert_eq!(first.to_string(), "2|851|1286|1285|1997-12-25|1997-12-25|0.03|0.00|0.04|0.01|-102.20681068856331|34.032813907715486|-102.19307587853756|34.03497048015551|") + } + + #[test] + fn test_building_generation() { + // Create a generator with a small scale factor + let generator = BuildingGenerator::new(0.01, 1, 1); + let buildings: Vec<_> = generator.iter().collect(); + + // Should have 0.01 * 20,000 = 200 buildings + assert_eq!(buildings.len(), 200); + + // Check first building + let first = &buildings[0]; + assert_eq!(first.b_buildingkey, 1); + + // Verify the string format matches the expected pattern + let expected_pattern = format!( + "{}|{}|{}|", + first.b_buildingkey, + first.b_name, + first.b_polygonwkt, + ); + assert_eq!(first.to_string(), expected_pattern); + + // Check first Building + let first = &buildings[1]; + assert_eq!(first.b_buildingkey, 2); + assert_eq!(first.to_string(), "2|blush|lar accounts amo|") } #[test]
