guysv2 opened a new issue, #17256: URL: https://github.com/apache/datafusion/issues/17256
### Describe the bug It appears datafusion DISTINCT ON support is currently limited for queries that use: * GROUP BY * Aggregation functions * Window functions I was wondering how far is the implementation currently from supporting aggregations and GROUP BYs? ### To Reproduce Here's a currently failing test: ```diff diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 0e1210ebb..529cd693c 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -50,6 +50,38 @@ async fn test_list_query_parameters() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_distict_on_with_aggregation() -> Result<()> { + let tmp_dir = TempDir::new()?; + let partition_count = 4; + let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; + + let results = ctx + .sql(" +SELECT DISTINCT ON(c1) + c1, + c3, + SUM(c2) AS c2_total +FROM test +GROUP BY c1, c3 +ORDER BY c1, c2_total DESC +") + .await? + .collect() + .await?; + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+------+----------+ + | c1 | c3 | c2_total | + +----+------+----------+ + | 0 | true | 30 | + | 1 | true | 30 | + | 2 | true | 30 | + | 3 | true | 30 | + +----+------+----------+ + "); + Ok(()) +} + #[tokio::test] async fn test_named_query_parameters() -> Result<()> { let tmp_dir = TempDir::new()?; ``` ### Expected behavior _No response_ ### Additional context looking at the code, it looks like all that's left to support it is passing the post-aggregation plan into `distict_on()`? here's a solution I sketched. after applying the patch, the test earlier now pass: ```diff diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c7ccda723..d5dedbd82 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -29,7 +29,7 @@ use crate::utils::{ use datafusion_common::error::DataFusionErrorBuilder; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; -use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_common::{not_impl_err, plan_err, Column, Result}; use datafusion_common::{RecursionUnnestOption, UnnestOptions}; use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, WildcardOptions}; use datafusion_expr::expr_rewriter::{ @@ -314,24 +314,44 @@ impl<S: ContextProvider> SqlToRel<'_, S> { LogicalPlanBuilder::from(plan).distinct()?.build() } Some(Distinct::On(on_expr)) => { - if !aggr_exprs.is_empty() - || !group_by_exprs.is_empty() - || !window_func_exprs.is_empty() - { - return not_impl_err!("DISTINCT ON expressions with GROUP BY, aggregation or window functions are not supported "); + if !window_func_exprs.is_empty() { + return not_impl_err!("DISTINCT ON expressions with window functions are not supported "); + } + // For DISTINCT ON with aggregation, we need to apply it after aggregation + if !aggr_exprs.is_empty() || !group_by_exprs.is_empty() { + // Apply DISTINCT ON after aggregation + let on_expr = on_expr + .into_iter() + .map(|e| { + self.sql_expr_to_logical_expr(e, plan.schema(), planner_context) + }) + .collect::<Result<Vec<_>>>()?; + + // Create expressions from the current plan schema for DISTINCT ON + let distinct_select_exprs = plan.schema() + .fields() + .iter() + .map(|field| Expr::Column(Column::from_name(field.name()))) + .collect::<Vec<_>>(); + + // Use the current plan (which includes aggregation) and apply DISTINCT ON + LogicalPlanBuilder::from(plan) + .distinct_on(on_expr, distinct_select_exprs, None)? + .build() + } else { + // No aggregation, apply DISTINCT ON to base plan + let on_expr = on_expr + .into_iter() + .map(|e| { + self.sql_expr_to_logical_expr(e, plan.schema(), planner_context) + }) + .collect::<Result<Vec<_>>>()?; + + // Build the final plan + LogicalPlanBuilder::from(base_plan) + .distinct_on(on_expr, select_exprs, None)? + .build() } - - let on_expr = on_expr - .into_iter() - .map(|e| { - self.sql_expr_to_logical_expr(e, plan.schema(), planner_context) - }) - .collect::<Result<Vec<_>>>()?; - - // Build the final plan - LogicalPlanBuilder::from(base_plan) - .distinct_on(on_expr, select_exprs, None)? - .build() } }?; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org