guysv2 opened a new issue, #17256:
URL: https://github.com/apache/datafusion/issues/17256

   ### Describe the bug
   
   It appears datafusion DISTINCT ON support is currently limited for queries 
that use:
   * GROUP BY
   * Aggregation functions
   * Window functions
   
   I was wondering how far is the implementation currently from supporting 
aggregations and GROUP BYs?
   
   ### To Reproduce
   
   Here's a currently failing test:
   ```diff
   diff --git a/datafusion/core/tests/sql/select.rs 
b/datafusion/core/tests/sql/select.rs
   index 0e1210ebb..529cd693c 100644
   --- a/datafusion/core/tests/sql/select.rs
   +++ b/datafusion/core/tests/sql/select.rs
   @@ -50,6 +50,38 @@ async fn test_list_query_parameters() -> Result<()> {
        Ok(())
    }
    
   +#[tokio::test]
   +async fn test_distict_on_with_aggregation() -> Result<()> {
   +    let tmp_dir = TempDir::new()?;
   +    let partition_count = 4;
   +    let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?;
   +
   +    let results = ctx
   +        .sql("
   +SELECT DISTINCT ON(c1)
   +    c1,
   +    c3,
   +    SUM(c2) AS c2_total
   +FROM test
   +GROUP BY c1, c3
   +ORDER BY c1, c2_total DESC
   +")
   +        .await?
   +        .collect()
   +        .await?;
   +    assert_snapshot!(batches_to_sort_string(&results), @r"
   +    +----+------+----------+
   +    | c1 | c3   | c2_total |
   +    +----+------+----------+
   +    | 0  | true | 30       |
   +    | 1  | true | 30       |
   +    | 2  | true | 30       |
   +    | 3  | true | 30       |
   +    +----+------+----------+
   +    ");
   +    Ok(())
   +}
   +
    #[tokio::test]
    async fn test_named_query_parameters() -> Result<()> {
        let tmp_dir = TempDir::new()?;
   ```
   
   ### Expected behavior
   
   _No response_
   
   ### Additional context
   
   looking at the code, it looks like all that's left to support it is passing 
the post-aggregation plan into `distict_on()`?
   
   here's a solution I sketched. after applying the patch, the test earlier now 
pass:
   ```diff
   diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
   index c7ccda723..d5dedbd82 100644
   --- a/datafusion/sql/src/select.rs
   +++ b/datafusion/sql/src/select.rs
   @@ -29,7 +29,7 @@ use crate::utils::{
    
    use datafusion_common::error::DataFusionErrorBuilder;
    use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
   -use datafusion_common::{not_impl_err, plan_err, Result};
   +use datafusion_common::{not_impl_err, plan_err, Column, Result};
    use datafusion_common::{RecursionUnnestOption, UnnestOptions};
    use datafusion_expr::expr::{Alias, PlannedReplaceSelectItem, 
WildcardOptions};
    use datafusion_expr::expr_rewriter::{
   @@ -314,24 +314,44 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                    LogicalPlanBuilder::from(plan).distinct()?.build()
                }
                Some(Distinct::On(on_expr)) => {
   -                if !aggr_exprs.is_empty()
   -                    || !group_by_exprs.is_empty()
   -                    || !window_func_exprs.is_empty()
   -                {
   -                    return not_impl_err!("DISTINCT ON expressions with 
GROUP BY, aggregation or window functions are not supported ");
   +                if !window_func_exprs.is_empty() {
   +                    return not_impl_err!("DISTINCT ON expressions with 
window functions are not supported ");
   +                }
   +                // For DISTINCT ON with aggregation, we need to apply it 
after aggregation
   +                if !aggr_exprs.is_empty() || !group_by_exprs.is_empty() {
   +                    // Apply DISTINCT ON after aggregation
   +                    let on_expr = on_expr
   +                        .into_iter()
   +                        .map(|e| {
   +                            self.sql_expr_to_logical_expr(e, plan.schema(), 
planner_context)
   +                        })
   +                        .collect::<Result<Vec<_>>>()?;
   +
   +                    // Create expressions from the current plan schema for 
DISTINCT ON
   +                    let distinct_select_exprs = plan.schema()
   +                        .fields()
   +                        .iter()
   +                        .map(|field| 
Expr::Column(Column::from_name(field.name())))
   +                        .collect::<Vec<_>>();
   +
   +                    // Use the current plan (which includes aggregation) 
and apply DISTINCT ON
   +                    LogicalPlanBuilder::from(plan)
   +                        .distinct_on(on_expr, distinct_select_exprs, None)?
   +                        .build()
   +                } else {
   +                    // No aggregation, apply DISTINCT ON to base plan
   +                    let on_expr = on_expr
   +                        .into_iter()
   +                        .map(|e| {
   +                            self.sql_expr_to_logical_expr(e, plan.schema(), 
planner_context)
   +                        })
   +                        .collect::<Result<Vec<_>>>()?;
   +
   +                    // Build the final plan
   +                    LogicalPlanBuilder::from(base_plan)
   +                        .distinct_on(on_expr, select_exprs, None)?
   +                        .build()
                    }
   -
   -                let on_expr = on_expr
   -                    .into_iter()
   -                    .map(|e| {
   -                        self.sql_expr_to_logical_expr(e, plan.schema(), 
planner_context)
   -                    })
   -                    .collect::<Result<Vec<_>>>()?;
   -
   -                // Build the final plan
   -                LogicalPlanBuilder::from(base_plan)
   -                    .distinct_on(on_expr, select_exprs, None)?
   -                    .build()
                }
            }?;
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to