alamb commented on code in PR #14673: URL: https://github.com/apache/datafusion/pull/14673#discussion_r1957081251
########## datafusion/physical-optimizer/src/enforce_sorting/mod.rs: ########## @@ -151,10 +163,15 @@ fn update_coalesce_ctx_children( }; } -/// The boolean flag `repartition_sorts` defined in the config indicates -/// whether we elect to transform [`CoalescePartitionsExec`] + [`SortExec`] cascades -/// into [`SortExec`] + [`SortPreservingMergeExec`] cascades, which enables us to -/// perform sorting in parallel. +/// Performs optimizations based upon a series of subrules. Review Comment: π ########## datafusion/physical-optimizer/src/enforce_sorting/mod.rs: ########## @@ -243,20 +260,66 @@ fn replace_with_partial_sort( Ok(plan) } -/// This function turns plans of the form +/// Transform [`CoalescePartitionsExec`] + [`SortExec`] into +/// [`SortExec`] + [`SortPreservingMergeExec`] as illustrated below: +/// +/// The [`CoalescePartitionsExec`] + [`SortExec`] cascades +/// combine the partitions first, and then sort: +/// ```text +/// β β β β β β β +/// βββ¬ββ¬ββ +/// ββBβAβDβ... ββββ +/// βββ΄ββ΄ββ β +/// β β β β β β β β ββββββββββββββββββββββββββ β β β β β β β β ββββββββββ β β β β β β β β β +/// Partition 1 β β Coalesce β βββ¬ββ¬ββ¬ββ¬ββ β β βββ¬ββ¬ββ¬ββ¬ββ +/// ββββΆ(no ordering guarantees)ββββΆββBβEβAβDβCβ...ββββΆ Sort βββββΆββAβBβCβDβEβ... β +/// β β β βββ΄ββ΄ββ΄ββ΄ββ β β βββ΄ββ΄ββ΄ββ΄ββ +/// β β β β β β β β ββββββββββββββββββββββββββ β β β β β β β β ββββββββββ β β β β β β β β β +/// βββ¬ββ β Partition Partition +/// ββEβCβ ... ββββ +/// βββ΄ββ +/// β β β β β β β +/// Partition 2 +/// ``` +/// +/// +/// The [`SortExec`] + [`SortPreservingMergeExec`] cascades +/// sorts each partition first, then merge partitions while retaining the sort: +/// ```text +/// β β β β β β β ββββββββββ β β β β β β β +/// βββ¬ββ¬ββ β β βββ¬ββ¬ββ +/// ββBβAβDβ... ββββΆβ Sort ββββΆββAβBβDβ... ββββ +/// βββ΄ββ΄ββ β β βββ΄ββ΄ββ β +/// β β β β β β β ββββββββββ β β β β β β β β βββββββββββββββββββββββ β β β β β β β β β +/// Partition 1 Partition 1 β β β βββ¬ββ¬ββ¬ββ¬ββ +/// ββββΆ SortPreservingMerge βββββΆββAβBβCβDβEβ... β +/// β β β βββ΄ββ΄ββ΄ββ΄ββ +/// β β β β β β β ββββββββββ β β β β β β β β βββββββββββββββββββββββ β β β β β β β β β +/// βββ¬ββ β β βββ¬ββ β Partition +/// ββEβCβ ... ββββΆβ Sort ββββΆββCβEβ ... ββββ +/// βββ΄ββ β β βββ΄ββ +/// β β β β β β β ββββββββββ β β β β β β β +/// Partition 2 Partition 2 +/// ``` +/// +/// The latter [`SortExec`] + [`SortPreservingMergeExec`] cascade performs the +/// sort first on a per-partition basis, thereby parallelizing the sort. +/// +/// +/// The outcome is that plans of the form Review Comment: nice -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org