Rich-T-kid commented on code in PR #22859:
URL: https://github.com/apache/datafusion/pull/22859#discussion_r3414212385
##########
datafusion/physical-plan/benches/multi_column_dictionary_group_values.rs:
##########
@@ -53,66 +50,142 @@ fn schema_for_cols(n_cols: usize) -> SchemaRef {
Arc::new(Schema::new(fields))
}
+fn count_distinct_tuples(cols: &[ArrayRef]) -> usize {
+ use std::collections::HashSet;
+ let n = cols[0].len();
+ let mut seen: HashSet<Vec<Option<u64>>> = HashSet::new();
+ for row in 0..n {
+ let key: Vec<Option<u64>> = cols
+ .iter()
+ .map(|c| {
+ let dict = c
+ .as_any()
+ .downcast_ref::<DictionaryArray<UInt64Type>>()
+ .unwrap();
+ if dict.is_null(row) {
+ None
+ } else {
+ Some(dict.keys().value(row))
+ }
+ })
+ .collect();
+ seen.insert(key);
+ }
+ seen.len()
+}
+
fn make_dict_col(
size: usize,
- num_distinct: usize,
+ group_ids: &[usize],
+ col_idx: usize,
+ per_col_card: usize,
null_density: f32,
seed: u64,
) -> ArrayRef {
- let mut rng = StdRng::seed_from_u64(seed);
-
- let strings: Vec<String> = (0..num_distinct)
+ let strings: Vec<String> = (0..per_col_card)
.map(|i| format!("dict_label_{i:012}"))
.collect();
let values = Arc::new(StringArray::from(
strings.iter().map(String::as_str).collect::<Vec<_>>(),
));
- // When the pool is at least as large as the batch, shuffle a prefix so
- // every row in this batch maps to a distinct key.
- let keys: Vec<u64> = if num_distinct >= size {
- let mut perm: Vec<u64> = (0..size as u64).collect();
- perm.shuffle(&mut rng);
- perm
- } else {
- (0..size)
- .map(|_| rng.random_range(0..num_distinct) as u64)
- .collect()
- };
+ let divisor = per_col_card.pow(col_idx as u32);
+ let keys: Vec<u64> = group_ids
+ .iter()
+ .map(|&g| ((g / divisor) % per_col_card) as u64)
+ .collect();
let keys_buf = Buffer::from_slice_ref(&keys);
let nulls: Option<NullBuffer> = (null_density > 0.0).then(|| {
+ let mut rng = StdRng::seed_from_u64(seed);
(0..size)
.map(|_| !rng.random_bool(null_density as f64))
.collect()
});
let key_array = PrimitiveArray::<UInt64Type>::new(keys_buf.into(), nulls);
Arc::new(DictionaryArray::<UInt64Type>::try_new(key_array,
values).unwrap())
+ as ArrayRef
}
+/// Each row is assigned a `group_id` (0..`target_distinct`). Column keys are
+/// derived from `group_id` via mixed-radix decomposition (treating `group_id`
+/// as a base-k number and reading off one digit per column), so rows with the
+/// same `group_id` always produce the same tuple. This keeps distinct groups
at
+/// exactly `target_distinct` regardless of column count.
fn make_batch(
n_cols: usize,
size: usize,
target_distinct: usize,
null_density: f32,
seed: u64,
) -> Vec<ArrayRef> {
- let half = CARDINALITY_RANGE / 2;
- let lo = (target_distinct * (100 - half) / 100).max(1);
- let hi = (target_distinct * (100 + half) / 100).max(lo);
let mut rng = StdRng::seed_from_u64(seed);
- (0..n_cols)
+
+ // When nulls are present all null rows coalesce into one extra group
+ // (None, None, …), so we generate one fewer non-null group to keep the
+ // total at exactly target_distinct.
+ let n_groups = if null_density > 0.0 {
+ target_distinct.saturating_sub(1).max(1)
+ } else {
+ target_distinct
+ };
+
+ let mut per_col_card = (n_groups as f64).powf(1.0 / n_cols as f64).ceil()
as usize;
+ per_col_card = per_col_card.max(1);
+ while per_col_card.saturating_pow(n_cols as u32) < n_groups {
+ per_col_card += 1;
+ }
+
+ let n_extra = size.saturating_sub(n_groups);
+ let mut group_ids: Vec<usize> = (0..n_groups.min(size)).collect();
+ group_ids.extend((0..n_extra).map(|_| rng.random_range(0..n_groups)));
+ group_ids.shuffle(&mut rng);
+
+ let cols: Vec<ArrayRef> = (0..n_cols)
+ .map(|col| make_dict_col(size, &group_ids, col, per_col_card,
null_density, seed))
+ .collect();
+
+ // run `BENCH_VALIDATE=1 cargo bench --bench
multi_column_dictionary_group_values -- --list` to validate that the generated
batches have the expected number of distinct groups
+ if std::env::var("BENCH_VALIDATE").is_ok() {
Review Comment:
I'm leaving this command here because it will be very useful for debugging in
the future. It allows engineers to check whether the group creation logic is
working as expected.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]