Jefffrey commented on code in PR #20182:
URL: https://github.com/apache/datafusion/pull/20182#discussion_r2777729730
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -205,5 +222,123 @@ where
Arc::new(array)
}
+/// Create a StructArray with multiple columns
+fn create_struct_array(array_len: usize) -> ArrayRef {
+ let mut rng = make_rng();
+
+ // Create 4 columns of different types for our struct array
+ let bool_array: ArrayRef = Arc::new(
+ (0..array_len)
+ .map(|_| Some(rng.random::<bool>()))
+ .collect::<arrow::array::BooleanArray>(),
+ );
+
+ let int32_array: ArrayRef = Arc::new(
Review Comment:
Maybe we could reuse the existing functions above to create these random
arrays? e.g. `primitive_array()`
The only difference I see is that it uses its own `rng`; is this a big
concern, considering each of these functions currently uses its own `rng`
anyway?
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -68,11 +71,25 @@ fn criterion_benchmark(c: &mut Criterion) {
name: "dictionary_utf8_int32",
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
},
+ BenchData {
+ name: "struct_array",
+ array: create_struct_array(BATCH_SIZE),
+ },
+ BenchData {
+ name: "run_array_int32",
+ array: create_run_array::<Int32Type>(BATCH_SIZE),
+ },
];
for BenchData { name, array } in cases {
// with_hash has different code paths for single vs multiple arrays and nulls vs no nulls
- let nullable_array = add_nulls(&array);
+ // RunArray encodes nulls in the values array, not at the array level
+ let nullable_array = if name.starts_with("run_array") {
Review Comment:
Maybe we should take the approach from #20179 and make this an explicit
property instead of checking by name.
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -205,5 +222,123 @@ where
Arc::new(array)
}
+/// Create a StructArray with multiple columns
+fn create_struct_array(array_len: usize) -> ArrayRef {
+ let mut rng = make_rng();
+
+ // Create 4 columns of different types for our struct array
+ let bool_array: ArrayRef = Arc::new(
+ (0..array_len)
+ .map(|_| Some(rng.random::<bool>()))
+ .collect::<arrow::array::BooleanArray>(),
+ );
+
+ let int32_array: ArrayRef = Arc::new(
+ (0..array_len)
+ .map(|_| Some(rng.random::<i32>()))
+ .collect::<PrimitiveArray<Int32Type>>(),
+ );
+
+ let int64_array: ArrayRef = Arc::new(
+ (0..array_len)
+ .map(|_| Some(rng.random::<i64>()))
+ .collect::<PrimitiveArray<Int64Type>>(),
+ );
+
+ let string_array: ArrayRef = {
+ let strings: Vec<String> = (0..array_len)
+ .map(|_| {
+ let len = rng.random_range(5..20);
+ let value: Vec<u8> =
+ rng.clone().sample_iter(&Alphanumeric).take(len).collect();
+ String::from_utf8(value).unwrap()
+ })
+ .collect();
+ Arc::new(StringArray::from(strings))
+ };
+
+ let fields = Fields::from(vec![
+ Field::new("bool_col", DataType::Boolean, false),
+ Field::new("int32_col", DataType::Int32, false),
+ Field::new("int64_col", DataType::Int64, false),
+ Field::new("string_col", DataType::Utf8, false),
+ ]);
+
+ Arc::new(StructArray::new(
+ fields,
+ vec![bool_array, int32_array, int64_array, string_array],
+ None,
+ ))
+}
+
+/// Create a RunArray to test run array hashing.
+fn create_run_array<T>(array_len: usize) -> ArrayRef
+where
+ T: ArrowPrimitiveType,
+ StandardUniform: Distribution<T::Native>,
+{
+ let mut rng = make_rng();
+
+ // Create runs of varying lengths
+ let mut run_ends = Vec::new();
+ let mut values = Vec::new();
+ let mut current_end = 0;
+
+ while current_end < array_len {
+ // Random run length between 1 and 50
+ let run_length = rng.random_range(1..=50).min(array_len - current_end);
+ current_end += run_length;
+ run_ends.push(current_end as i32);
+ values.push(Some(rng.random::<T::Native>()));
+ }
+
+ let run_ends_array = Arc::new(PrimitiveArray::<Int32Type>::from(run_ends));
+ let values_array: Arc<dyn Array> =
+ Arc::new(values.into_iter().collect::<PrimitiveArray<T>>());
+
+ Arc::new(
+ RunArray::try_new(&run_ends_array, values_array.as_ref())
+ .expect("Failed to create RunArray"),
+ )
+}
+
+/// Create a RunArray with null values
+fn create_run_array_with_null_values<T>(array_len: usize) -> ArrayRef
Review Comment:
I feel we can reduce the duplication with the above function here if the only
difference is nulls 🤔
It would just be a matter of calling `add_nulls()` on the values array.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]