Jefffrey commented on code in PR #20182:
URL: https://github.com/apache/datafusion/pull/20182#discussion_r2778425679
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -47,50 +51,75 @@ fn criterion_benchmark(c: &mut Criterion) {
BenchData {
name: "int64",
array: primitive_array::<Int64Type>(BATCH_SIZE),
+ supports_nulls: true,
},
BenchData {
name: "utf8",
array: pool.string_array::<i32>(BATCH_SIZE),
+ supports_nulls: true,
},
BenchData {
name: "large_utf8",
array: pool.string_array::<i64>(BATCH_SIZE),
+ supports_nulls: true,
},
BenchData {
name: "utf8_view",
array: pool.string_view_array(BATCH_SIZE),
+ supports_nulls: true,
},
BenchData {
name: "utf8_view (small)",
array: small_pool.string_view_array(BATCH_SIZE),
+ supports_nulls: true,
},
BenchData {
name: "dictionary_utf8_int32",
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
+ supports_nulls: true,
+ },
+ BenchData {
+ name: "struct_array",
+ array: create_struct_array(BATCH_SIZE),
+ supports_nulls: true,
+ },
+ BenchData {
+ name: "run_array_int32",
+ array: create_run_array::<Int32Type>(BATCH_SIZE),
+ supports_nulls: true,
},
];
- for BenchData { name, array } in cases {
- // with_hash has different code paths for single vs multiple arrays
and nulls vs no nulls
- let nullable_array = add_nulls(&array);
+ for BenchData {
+ name,
+ array,
+ supports_nulls,
+ } in cases
+ {
c.bench_function(&format!("{name}: single, no nulls"), |b| {
do_hash_test(b, std::slice::from_ref(&array));
});
- c.bench_function(&format!("{name}: single, nulls"), |b| {
- do_hash_test(b, std::slice::from_ref(&nullable_array));
- });
c.bench_function(&format!("{name}: multiple, no nulls"), |b| {
let arrays = vec![array.clone(), array.clone(), array.clone()];
do_hash_test(b, &arrays);
});
- c.bench_function(&format!("{name}: multiple, nulls"), |b| {
- let arrays = vec![
- nullable_array.clone(),
- nullable_array.clone(),
- nullable_array.clone(),
- ];
- do_hash_test(b, &arrays);
- });
+
+ // Union arrays can't have null bitmasks
Review Comment:
Mentioning union array when we don't implement that here?
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -122,16 +151,51 @@ where
builder.finish().expect("should be nulls in buffer")
}
-// Returns an new array that is the same as array, but with nulls
+// Returns a new array that is the same as array, but with nulls
+// Handles the special case of RunArray where nulls must be in the values array
fn add_nulls(array: &ArrayRef) -> ArrayRef {
- let array_data = array
- .clone()
- .into_data()
- .into_builder()
- .nulls(Some(create_null_mask(array.len())))
- .build()
- .unwrap();
- make_array(array_data)
+ use arrow::datatypes::DataType;
+
+ match array.data_type() {
+ DataType::RunEndEncoded(_, _) => {
+ // RunArray can't have top-level nulls, so apply nulls to the
values array
+ let run_array = array
+ .as_any()
+ .downcast_ref::<RunArray<Int32Type>>()
+ .expect("Expected RunArray");
+
+ let run_ends_buffer = run_array.run_ends().inner().clone();
+ let run_ends_array =
PrimitiveArray::<Int32Type>::new(run_ends_buffer, None);
+ let values = run_array.values().clone();
+
+ // Add nulls to the values array
+ let values_with_nulls = {
+ let array_data = values
+ .clone()
+ .into_data()
+ .into_builder()
+ .nulls(Some(create_null_mask(values.len())))
Review Comment:
Something to think about is how null density acts differently here for run
arrays, since we'd apply null on entire runs 🤔
##########
datafusion/common/benches/with_hashes.rs:
##########
@@ -205,5 +269,79 @@ where
Arc::new(array)
}
+fn boolean_array(array_len: usize) -> ArrayRef {
+ let mut rng = make_rng();
+ Arc::new(
+ (0..array_len)
+ .map(|_| Some(rng.random::<bool>()))
+ .collect::<arrow::array::BooleanArray>(),
+ )
+}
+
+fn string_array(array_len: usize) -> ArrayRef {
Review Comment:
Do we need this if we already have `StringPool` above?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]