gabotechs commented on code in PR #15544: URL: https://github.com/apache/datafusion/pull/15544#discussion_r2026317591
########## datafusion/sqllogictest/test_files/aggregate.slt: ########## @@ -3194,6 +3196,28 @@ select array_agg(column1) from t; statement ok drop table t; +# array_agg_ignore_nulls +statement ok +create table t as values (NULL, ''), (1, 'c'), (2, 'a'), (NULL, 'b'), (4, NULL), (NULL, NULL), (5, 'a'); + +query ? +select array_agg(column1) ignore nulls as c1 from t; +---- +[1, 2, 4, 5] + +query II +select count(*), array_length(array_agg(distinct column2) ignore nulls) from t; +---- +7 4 + +query ? +select array_agg(column2 order by column1) ignore nulls from t; +---- +[c, a, a, , b] + +statement ok +drop table t; + Review Comment: For the shake of completeness, how about adding this other test that mixes DISTINCT with ORDER BY: ```sql query ? select array_agg(DISTINCT column2 order by column2) ignore nulls from t; ---- [, a, b, c] ``` ########## datafusion/functions-aggregate/src/array_agg.rs: ########## @@ -500,11 +548,30 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { return Ok(()); } - let n_row = values[0].len(); - for index in 0..n_row { - let row = get_row_at_idx(values, index)?; - self.values.push(row[0].clone()); - self.ordering_values.push(row[1..].to_vec()); + let val = &values[0]; + let ord = &values[1..]; + let nulls = if self.ignore_nulls { + val.logical_nulls() + } else { + None + }; + + match nulls { + Some(nulls) if nulls.null_count() >= val.len() => (), + Some(nulls) => { + for i in 0..val.len() { + if nulls.is_valid(i) { + self.values.push(ScalarValue::try_from_array(val, i)?); + self.ordering_values.push(get_row_at_idx(ord, i)?) + } + } + } + None => { + for i in 0..val.len() { + self.values.push(ScalarValue::try_from_array(val, i)?); + self.ordering_values.push(get_row_at_idx(ord, i)?) + } + } } Review Comment: 🤔 I wonder if there is a way of avoiding some of the repetition here, I can think of something like ```rust for i in 0..val.len() { if nulls.as_ref().is_none_or(|n| n.is_valid(i)) { self.values.push(ScalarValue::try_from_array(val, i)?); self.ordering_values.push(get_row_at_idx(ord, i)?) } } ``` but you'd lose the `ulls.null_count() >= val.len()` case optimization. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org