Copilot commented on code in PR #17:
URL: https://github.com/apache/sedona-spatialbench/pull/17#discussion_r2361891876
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -184,6 +185,100 @@ async fn test_write_parquet_trips() {
}
}
+#[tokio::test]
+async fn test_write_parquet_row_group_size_default() {
+ // Run the CLI command to generate parquet data with default settings
+ let output_dir = tempdir().unwrap();
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("parquet")
+ .arg("--scale-factor")
+ .arg("1")
+ .arg("--tables")
+ .arg("trip,driver,vehicle,customer,building")
+ .arg("--output-dir")
+ .arg(output_dir.path())
+ .assert()
+ .success();
+
+ expect_row_group_sizes(
+ output_dir.path(),
+ vec![
+ RowGroups {
+ table: "customer",
+ row_group_bytes: vec![2600113],
+ },
+ RowGroups {
+ table: "trip",
+ row_group_bytes: vec![123519959, 123486809, 123476361, 123492237],
+ },
+ RowGroups {
+ table: "driver",
+ row_group_bytes: vec![41594],
+ },
+ RowGroups {
+ table: "vehicle",
+ row_group_bytes: vec![5393],
+ },
+ RowGroups {
+ table: "building",
+ row_group_bytes: vec![2492865],
+ },
+ ],
+ );
+}
+
+#[tokio::test]
+async fn test_write_parquet_row_group_size_20mb() {
+ // Run the CLI command to generate parquet data with larger row group size
Review Comment:
The comment says 'larger row group size' but 20MB is smaller than the
default 128MB target. Update wording to 'smaller row group size' (or adjust the
test to actually use a larger size) to prevent confusion.
```suggestion
    // Run the CLI command to generate parquet data with smaller row group size (20MB vs default 128MB)
```
##########
spatialbench-cli/src/main.rs:
##########
@@ -19,6 +19,7 @@
//!   --part <N>                     Which part to generate (1-based, default: 1)
//!   -n, --num-threads <N>          Number of threads to use (default: number of CPUs)
//!   -c, --parquet-compression <C>  Parquet compression codec, e.g., SNAPPY, ZSTD(1), UNCOMPRESSED (default: SNAPPY)
+//!   --parquet-row-group-size <N>  Number of rows per row group in Parquet files (default: 134,217,728)
Review Comment:
Flag name and semantics are inconsistent: help text documents
--parquet-row-group-size and says 'Number of rows', but the implemented flag is
--parquet-row-group-bytes and the value represents bytes, not rows. Align by
updating this line to use --parquet-row-group-bytes and clarify it is a target
size in bytes.
```suggestion
//!   --parquet-row-group-bytes <N>  Target size in bytes per row group in Parquet files (default: 134,217,728)
```
##########
spatialbench-cli/src/plan.rs:
##########
@@ -58,18 +58,22 @@ pub struct GenerationPlan {
part_list: RangeInclusive<i32>,
}
+pub const DEFAULT_PARQUET_ROW_GROUP_BYTES: i64 = 128 * 1024 * 1024;
+
impl GenerationPlan {
/// Returns a GenerationPlan number of parts to generate
///
/// # Arguments
/// * `cli_part`: optional part number to generate (1-based), `--part` CLI argument
/// * `cli_part_count`: optional total number of parts, `--parts` CLI argument
+ /// * `parquet_row_group_size`: optional parquet row group size, `--parquet-row-group-size` CLI argument
pub fn try_new(
table: &Table,
format: OutputFormat,
scale_factor: f64,
cli_part: Option<i32>,
cli_part_count: Option<i32>,
+ parquet_row_group_bytes: i64,
Review Comment:
Documentation refers to parquet_row_group_size and --parquet-row-group-size,
but the parameter and actual CLI flag use parquet_row_group_bytes /
--parquet-row-group-bytes. Rename in the doc comment (and everywhere) to
'parquet_row_group_bytes' and reference the actual flag to avoid user confusion.
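For reference, the corrected doc line (the same rename is proposed as a concrete suggestion on the doc-comment anchor further below) would read:
```rust
/// * `parquet_row_group_bytes`: optional parquet row group size in bytes, `--parquet-row-group-bytes` CLI argument
```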
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -277,6 +372,36 @@ fn test_spatialbench_cli_zero_part_zero_parts() {
));
}
+/// Test specifying parquet options even when writing tbl output
+#[tokio::test]
+async fn test_incompatible_options_warnings() {
+ let output_dir = tempdir().unwrap();
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("csv")
+ .arg("--tables")
+ .arg("trip")
+ .arg("--scale-factor")
+ .arg("0.0001")
+ .arg("--output-dir")
+ .arg(output_dir.path())
+ // pass in parquet options that are incompatible with csv
+ .arg("--parquet-compression")
+ .arg("zstd(1)")
+ .arg("--parquet-row-group-bytes")
+ .arg("8192")
+ .assert()
+ // still success, but should see warnints
Review Comment:
Typo: 'warnints' should be 'warnings'.
```suggestion
// still success, but should see warnings
```
##########
spatialbench-cli/src/plan.rs:
##########
@@ -58,18 +58,22 @@ pub struct GenerationPlan {
part_list: RangeInclusive<i32>,
}
+pub const DEFAULT_PARQUET_ROW_GROUP_BYTES: i64 = 128 * 1024 * 1024;
+
impl GenerationPlan {
/// Returns a GenerationPlan number of parts to generate
///
/// # Arguments
/// * `cli_part`: optional part number to generate (1-based), `--part` CLI argument
/// * `cli_part_count`: optional total number of parts, `--parts` CLI argument
+ /// * `parquet_row_group_size`: optional parquet row group size, `--parquet-row-group-size` CLI argument
Review Comment:
Documentation refers to parquet_row_group_size and --parquet-row-group-size,
but the parameter and actual CLI flag use parquet_row_group_bytes /
--parquet-row-group-bytes. Rename in the doc comment (and everywhere) to
'parquet_row_group_bytes' and reference the actual flag to avoid user confusion.
```suggestion
/// * `parquet_row_group_bytes`: optional parquet row group size in bytes, `--parquet-row-group-bytes` CLI argument
```
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -184,6 +185,100 @@ async fn test_write_parquet_trips() {
}
}
+#[tokio::test]
+async fn test_write_parquet_row_group_size_default() {
+ // Run the CLI command to generate parquet data with default settings
+ let output_dir = tempdir().unwrap();
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("parquet")
+ .arg("--scale-factor")
+ .arg("1")
+ .arg("--tables")
+ .arg("trip,driver,vehicle,customer,building")
+ .arg("--output-dir")
+ .arg(output_dir.path())
+ .assert()
+ .success();
+
+ expect_row_group_sizes(
+ output_dir.path(),
+ vec![
+ RowGroups {
+ table: "customer",
+ row_group_bytes: vec![2600113],
+ },
+ RowGroups {
+ table: "trip",
+ row_group_bytes: vec![123519959, 123486809, 123476361, 123492237],
+ },
+ RowGroups {
+ table: "driver",
+ row_group_bytes: vec![41594],
+ },
+ RowGroups {
+ table: "vehicle",
+ row_group_bytes: vec![5393],
+ },
+ RowGroups {
+ table: "building",
+ row_group_bytes: vec![2492865],
+ },
+ ],
+ );
+}
+
+#[tokio::test]
+async fn test_write_parquet_row_group_size_20mb() {
+ // Run the CLI command to generate parquet data with larger row group size
+ let output_dir = tempdir().unwrap();
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("parquet")
+ .arg("--scale-factor")
+ .arg("1")
+ .arg("--tables")
+ .arg("trip,driver,vehicle,customer,building")
+ .arg("--output-dir")
+ .arg(output_dir.path())
+ .arg("--parquet-row-group-bytes")
+ .arg("20000000") // 20 MB
+ .assert()
+ .success();
+
+ expect_row_group_sizes(
+ output_dir.path(),
+ vec![
+ RowGroups {
+ table: "customer",
+ row_group_bytes: vec![2600113],
+ },
+ RowGroups {
+ table: "trip",
+ row_group_bytes: vec![
+ 24361422, 24361685, 24350928, 24348682, 24353605, 24335813, 24358941, 24343011,
+ 24345967, 24361312, 24337627, 24345972, 24348724, 24361400, 24361528, 24346264,
+ 24351137, 24338412, 24348304, 24361680, 24351433,
+ ],
+ },
+ RowGroups {
+ table: "driver",
+ row_group_bytes: vec![41594],
+ },
+ RowGroups {
+ table: "vehicle",
+ row_group_bytes: vec![5393],
+ },
+ RowGroups {
+ table: "building",
+ row_group_bytes: vec![2492865],
+ },
+ ],
+ );
+}
+
Review Comment:
[nitpick] Asserting exact row group byte sizes makes the tests brittle
(minor upstream changes in encoding, library version, or metadata ordering
could cause spurious failures). Consider asserting number of row groups and
that each size is within an expected tolerance (e.g. ±5%) of the target, or
only that no group exceeds the configured target size.
```suggestion
    // Instead of asserting exact row group sizes, check that the number of row groups is as expected,
    // and that each row group is within ±5% of the target size (20 MB = 20_000_000 bytes).
    expect_row_group_sizes_within_tolerance(
        output_dir.path(),
        vec![
            ("customer", 1),
            ("trip", 21),
            ("driver", 1),
            ("vehicle", 1),
            ("building", 1),
        ],
        20_000_000,
        0.05, // 5% tolerance
    );
}

/// Helper function to check that each table's Parquet file has the expected number of row groups,
/// and that each row group size is within the given tolerance of the target size.
fn expect_row_group_sizes_within_tolerance<P: AsRef<Path>>(
    output_dir: P,
    expected: Vec<(&str, usize)>,
    target_size: usize,
    tolerance: f64,
) {
    use parquet::file::reader::{FileReader, SerializedFileReader};
    for (table, expected_num_row_groups) in expected {
        let file_path = output_dir.as_ref().join(format!("{}.parquet", table));
        let file = File::open(&file_path)
            .unwrap_or_else(|_| panic!("Failed to open parquet file for table {}", table));
        let reader = SerializedFileReader::new(file).expect("Failed to create Parquet reader");
        let metadata = reader.metadata();
        let row_groups = metadata.row_groups();
        assert_eq!(
            row_groups.len(),
            expected_num_row_groups,
            "Table '{}' expected {} row groups, found {}",
            table,
            expected_num_row_groups,
            row_groups.len()
        );
        let lower = (target_size as f64 * (1.0 - tolerance)) as usize;
        let upper = (target_size as f64 * (1.0 + tolerance)) as usize;
        for (i, rg) in row_groups.iter().enumerate() {
            let size = rg.total_byte_size() as usize;
            // Allow smaller row groups for small tables (e.g., customer, driver, etc.)
            if expected_num_row_groups == 1 {
                // Just check it's nonzero
                assert!(size > 0, "Table '{}' row group {} has zero size", table, i);
            } else {
                assert!(
                    size >= lower && size <= upper,
                    "Table '{}' row group {} size {} not within ±{:.0}% of target {} (allowed: {}-{})",
                    table,
                    i,
                    size,
                    tolerance * 100.0,
                    target_size,
                    lower,
                    upper
                );
            }
        }
    }
}
```
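One caveat with this sketch: in parquet-rs, `RowGroupMetaData::total_byte_size()` reports the uncompressed size of the row group, so the tolerance here is applied to uncompressed bytes; if `--parquet-row-group-bytes` targets on-disk (compressed) size instead, `RowGroupMetaData::compressed_size()` would be the metric to check.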
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -299,3 +424,41 @@ fn read_reference_file(table_name: &str, scale_factor: &str) -> String {
}
}
}
+
+#[derive(Debug, PartialEq)]
+struct RowGroups {
+ table: &'static str,
+ /// total bytes in each row group
+ row_group_bytes: Vec<i64>,
+}
+
+/// For each table in tables, check that the parquet file in output_dir has
+/// a file with the expected row group sizes.
+fn expect_row_group_sizes(output_dir: &Path, expected_row_groups: Vec<RowGroups>) {
+ let mut actual_row_groups = vec![];
+ for table in &expected_row_groups {
+ let output_path = output_dir.join(format!("{}.parquet", table.table));
+ assert!(
+ output_path.exists(),
+ "Expected parquet file {:?} to exist",
+ output_path
+ );
+ // read the metadata to get the row group size
+ let file = File::open(&output_path).expect("Failed to open parquet file");
+ let mut metadata_reader = ParquetMetaDataReader::new();
+ metadata_reader.try_parse(&file).unwrap();
+ let metadata = metadata_reader.finish().unwrap();
+ let row_groups = metadata.row_groups();
+ let actual_row_group_bytes: Vec<_> =
+ row_groups.iter().map(|rg| rg.total_byte_size()).collect();
+ actual_row_groups.push(RowGroups {
+ table: table.table,
+ row_group_bytes: actual_row_group_bytes,
+ })
+ }
+ // compare the expected and actual row groups debug print actual on failure
+ // for better output / easier comparison
+ let expected_row_groups = format!("{expected_row_groups:#?}");
+ let actual_row_groups = format!("{actual_row_groups:#?}");
+ assert_eq!(actual_row_groups, expected_row_groups);
Review Comment:
[nitpick] Comparing formatted strings loses structural type comparison and
reverses the typical expected/actual ordering; compare the Vec<RowGroups>
directly and use an assert message for diagnostics:
assert_eq!(actual_row_groups, expected_row_groups, "Mismatch: actual={:#?}",
actual_row_groups).
```suggestion
    assert_eq!(
        actual_row_groups,
        expected_row_groups,
        "Mismatch: actual={:#?}",
        actual_row_groups
    );
```
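Note that this suggestion alone still compares the two `format!`-ed `String`s built on the preceding lines; a minimal sketch of the fuller change, assuming the two `format!` bindings are removed so the derived `PartialEq` on `RowGroups` does the comparison directly:
```rust
    // compare the Vec<RowGroups> values directly via the derived PartialEq;
    // on failure, the assert message pretty-prints the actual row groups
    assert_eq!(
        actual_row_groups,
        expected_row_groups,
        "Mismatch: actual={:#?}",
        actual_row_groups
    );
```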
##########
spatialbench-cli/tests/cli_integration.rs:
##########
@@ -184,6 +185,100 @@ async fn test_write_parquet_trips() {
}
}
+#[tokio::test]
+async fn test_write_parquet_row_group_size_default() {
+ // Run the CLI command to generate parquet data with default settings
+ let output_dir = tempdir().unwrap();
+ Command::cargo_bin("spatialbench-cli")
+ .expect("Binary not found")
+ .arg("--format")
+ .arg("parquet")
+ .arg("--scale-factor")
+ .arg("1")
+ .arg("--tables")
+ .arg("trip,driver,vehicle,customer,building")
+ .arg("--output-dir")
+ .arg(output_dir.path())
+ .assert()
+ .success();
+
+ expect_row_group_sizes(
+ output_dir.path(),
+ vec![
+ RowGroups {
+ table: "customer",
+ row_group_bytes: vec![2600113],
+ },
+ RowGroups {
+ table: "trip",
+ row_group_bytes: vec![123519959, 123486809, 123476361, 123492237],
+ },
+ RowGroups {
+ table: "driver",
+ row_group_bytes: vec![41594],
+ },
+ RowGroups {
+ table: "vehicle",
+ row_group_bytes: vec![5393],
+ },
+ RowGroups {
+ table: "building",
+ row_group_bytes: vec![2492865],
Review Comment:
[nitpick] Asserting exact row group byte sizes makes the tests brittle
(minor upstream changes in encoding, library version, or metadata ordering
could cause spurious failures). Consider asserting number of row groups and
that each size is within an expected tolerance (e.g. ±5%) of the target, or
only that no group exceeds the configured target size.
```suggestion
    // Instead of asserting exact row group sizes, check number of row groups and that each is within ±5% of the target size.
expect_row_group_sizes_with_tolerance(
output_dir.path(),
vec![
RowGroupSpec {
table: "customer",
expected_num_row_groups: 1,
target_row_group_size: 2_600_000, // bytes
tolerance: 0.05,
},
RowGroupSpec {
table: "trip",
expected_num_row_groups: 4,
target_row_group_size: 123_500_000, // bytes
tolerance: 0.05,
},
RowGroupSpec {
table: "driver",
expected_num_row_groups: 1,
target_row_group_size: 42_000, // bytes
tolerance: 0.05,
},
RowGroupSpec {
table: "vehicle",
expected_num_row_groups: 1,
target_row_group_size: 5_400, // bytes
tolerance: 0.05,
},
RowGroupSpec {
table: "building",
expected_num_row_groups: 1,
target_row_group_size: 2_500_000, // bytes
tolerance: 0.05,
```
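Note that the helper shape sketched here (expect_row_group_sizes_with_tolerance taking RowGroupSpec structs) differs from the tuple-based expect_row_group_sizes_within_tolerance sketched in the sibling comment above; whichever variant is adopted, a single shared helper would keep the two tests consistent.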
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]