While working on the regression tests added in a14a58329, I noticed that DISTINCT does not make use of Incremental Sort. It'll only ever do full sorts on the cheapest input path or make use of a path that's already got the required pathkeys. Also, I see that create_final_distinct_paths() is a little quirky: if the cheapest input path happens to be sorted, it'll add_path() the same path twice, which seems like a bit of a waste of effort. That could happen if, say, enable_seqscan is off or if a Merge Join is the cheapest join method.
Additionally, the parallel DISTINCT code looks like it should also get the same treatment. I see that I'd coded this to only add a unique path atop a presorted path, and it never considers sorting the cheapest partial path. I've adjusted that in the attached and also made it consider incrementally sorting any path with presorted keys. Please see the attached patch. David
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 000d757bdd..c908944071 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4654,22 +4654,63 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, cheapest_partial_path->rows, NULL, NULL); - /* first try adding unique paths atop of sorted paths */ + /* + * First try sorting the cheapest path and incrementally sorting any paths + * with presorted keys and put a unique paths atop of those. + */ if (grouping_is_sortable(parse->distinctClause)) { foreach(lc, input_rel->partial_pathlist) { - Path *path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path; + bool is_sorted; + int presorted_keys; - if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys)) + is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys, + input_path->pathkeys, + &presorted_keys); + + if (is_sorted) + sorted_path = input_path; + else { - add_partial_path(partial_distinct_rel, (Path *) - create_upper_unique_path(root, - partial_distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * Try at least sorting the cheapest path and also try + * incrementally sorting any path which is partially sorted + * already (no need to deal with paths which have presorted keys + * when incremental sort is disabled unless it's the cheapest + * partial path). + */ + if (input_path != cheapest_partial_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; + + /* + * We've no need to consider both a sort and incremental sort. + * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. 
+ */ + if (presorted_keys == 0 || !enable_incremental_sort) + sorted_path = (Path *) create_sort_path(root, + partial_distinct_rel, + input_path, + root->distinct_pathkeys, + -1.0); + else + sorted_path = (Path *) create_incremental_sort_path(root, + partial_distinct_rel, + input_path, + root->distinct_pathkeys, + presorted_keys, + -1.0); } + + add_partial_path(partial_distinct_rel, (Path *) + create_upper_unique_path(root, partial_distinct_rel, + sorted_path, + list_length(root->distinct_pathkeys), + numDistinctRows)); } } @@ -4785,8 +4826,8 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, * the other.) */ List *needed_pathkeys; - Path *path; ListCell *lc; + double limittuples = root->distinct_pathkeys == NIL ? 1.0 : -1.0; if (parse->hasDistinctOn && list_length(root->distinct_pathkeys) < @@ -4797,96 +4838,89 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, foreach(lc, input_rel->pathlist) { - path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path; + bool is_sorted; + int presorted_keys; - if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) + is_sorted = pathkeys_count_contained_in(needed_pathkeys, + input_path->pathkeys, + &presorted_keys); + + if (is_sorted) + sorted_path = input_path; + else { /* - * distinct_pathkeys may have become empty if all of the - * pathkeys were determined to be redundant. If all of the - * pathkeys are redundant then each DISTINCT target must only - * allow a single value, therefore all resulting tuples must - * be identical (or at least indistinguishable by an equality - * check). We can uniquify these tuples simply by just taking - * the first tuple. All we do here is add a path to do "LIMIT - * 1" atop of 'path'. When doing a DISTINCT ON we may still - * have a non-NIL sort_pathkeys list, so we must still only do - * this with paths which are correctly sorted by - * sort_pathkeys. 
+ * Try at least sorting the cheapest path and also try + * incrementally sorting any path which is partially sorted + * already (no need to deal with paths which have presorted keys + * when incremental sort is disabled unless it's the cheapest + * input path). */ - if (root->distinct_pathkeys == NIL) - { - Node *limitCount; - - limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, - sizeof(int64), - Int64GetDatum(1), false, - FLOAT8PASSBYVAL); + if (input_path != cheapest_input_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; - /* - * If the query already has a LIMIT clause, then we could - * end up with a duplicate LimitPath in the final plan. - * That does not seem worth troubling over too much. - */ - add_path(distinct_rel, (Path *) - create_limit_path(root, distinct_rel, path, NULL, - limitCount, LIMIT_OPTION_COUNT, - 0, 1)); - } + /* + * We've no need to consider both a sort and incremental sort. + * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. + */ + if (presorted_keys == 0 || !enable_incremental_sort) + sorted_path = (Path *) create_sort_path(root, + distinct_rel, + input_path, + needed_pathkeys, + limittuples); else - { - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); - } + sorted_path = (Path *) create_incremental_sort_path(root, + distinct_rel, + input_path, + needed_pathkeys, + presorted_keys, + limittuples); } - } - /* For explicit-sort case, always use the more rigorous clause */ - if (list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) - { - needed_pathkeys = root->sort_pathkeys; - /* Assert checks that parser didn't mess up... 
*/ - Assert(pathkeys_contained_in(root->distinct_pathkeys, - needed_pathkeys)); - } - else - needed_pathkeys = root->distinct_pathkeys; + /* + * distinct_pathkeys may have become empty if all of the pathkeys + * were determined to be redundant. If all of the pathkeys are + * redundant then each DISTINCT target must only allow a single + * value, therefore all resulting tuples must be identical (or at + * least indistinguishable by an equality check). We can uniquify + * these tuples simply by just taking the first tuple. All we do + * here is add a path to do "LIMIT 1" atop of 'sorted_path'. When + * doing a DISTINCT ON we may still have a non-NIL sort_pathkeys + * list, so we must still only do this with paths which are + * correctly sorted by sort_pathkeys. + */ + if (root->distinct_pathkeys == NIL) + { + Node *limitCount; - path = cheapest_input_path; - if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - path = (Path *) create_sort_path(root, distinct_rel, - path, - needed_pathkeys, - root->distinct_pathkeys == NIL ? - 1.0 : -1.0); + limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, + sizeof(int64), + Int64GetDatum(1), false, + FLOAT8PASSBYVAL); - /* - * As above, use a LimitPath instead of a UniquePath when all of the - * distinct_pathkeys are redundant and we're only going to get a - * series of tuples all with the same values anyway. - */ - if (root->distinct_pathkeys == NIL) - { - Node *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, - sizeof(int64), - Int64GetDatum(1), false, - FLOAT8PASSBYVAL); - - add_path(distinct_rel, (Path *) - create_limit_path(root, distinct_rel, path, NULL, - limitCount, LIMIT_OPTION_COUNT, 0, 1)); - } - else - { - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * If the query already has a LIMIT clause, then we could + * end up with a duplicate LimitPath in the final plan. 
+ * That does not seem worth troubling over too much. + */ + add_path(distinct_rel, (Path *) + create_limit_path(root, distinct_rel, sorted_path, + NULL, limitCount, + LIMIT_OPTION_COUNT, 0, 1)); + } + else + { + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + sorted_path, + list_length(root->distinct_pathkeys), + numDistinctRows)); + } } } diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index 1a1e8b2365..0c3433f8e5 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1484,15 +1484,16 @@ explain (costs off) select * from t union select * from t order by 1,3; -- Full sort, not just incremental sort can be pushed below a gather merge path -- by generate_useful_gather_paths. explain (costs off) select distinct a,b from t; - QUERY PLAN ------------------------------------------- + QUERY PLAN +------------------------------------------------ Unique -> Gather Merge Workers Planned: 2 - -> Sort - Sort Key: a, b - -> Parallel Seq Scan on t -(6 rows) + -> Unique + -> Sort + Sort Key: a, b + -> Parallel Seq Scan on t +(7 rows) drop table t; -- Sort pushdown can't go below where expressions are part of the rel target. 
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 6ce889d87c..1fc07f220f 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -171,6 +171,20 @@ SELECT DISTINCT g%1000 FROM generate_series(0,9999) g; SET jit_above_cost TO DEFAULT; CREATE TABLE distinct_group_2 AS SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g; +SET enable_seqscan = 0; +-- Check to see we get an incremental sort plan +EXPLAIN (costs off) +SELECT DISTINCT hundred, two FROM tenk1; + QUERY PLAN +----------------------------------------------------- + Unique + -> Incremental Sort + Sort Key: hundred, two + Presorted Key: hundred + -> Index Scan using tenk1_hundred on tenk1 +(5 rows) + +RESET enable_seqscan; SET enable_hashagg=TRUE; -- Produce results with hash aggregation. SET enable_sort=FALSE; @@ -265,15 +279,16 @@ $$ LANGUAGE plpgsql PARALLEL SAFE; -- Ensure we do parallel distinct now that the function is parallel safe EXPLAIN (COSTS OFF) SELECT DISTINCT distinct_func(1) FROM tenk1; - QUERY PLAN ----------------------------------------------- + QUERY PLAN +---------------------------------------------------- Unique - -> Sort - Sort Key: (distinct_func(1)) - -> Gather - Workers Planned: 2 - -> Parallel Seq Scan on tenk1 -(6 rows) + -> Gather Merge + Workers Planned: 2 + -> Unique + -> Sort + Sort Key: (distinct_func(1)) + -> Parallel Seq Scan on tenk1 +(7 rows) RESET max_parallel_workers_per_gather; RESET min_parallel_table_scan_size; diff --git a/src/test/regress/expected/window.out b/src/test/regress/expected/window.out index 90e89fb5b6..b2c6605e60 100644 --- a/src/test/regress/expected/window.out +++ b/src/test/regress/expected/window.out @@ -3944,8 +3944,9 @@ ORDER BY depname, enroll_date; QUERY PLAN ----------------------------------------------------------------------------------------------- Unique - -> Sort + -> Incremental Sort Sort Key: depname, 
enroll_date, empno, (sum(salary) OVER (?)), (min(salary) OVER (?)) + Presorted Key: depname, enroll_date -> WindowAgg -> Incremental Sort Sort Key: depname, enroll_date @@ -3954,7 +3955,7 @@ ORDER BY depname, enroll_date; -> Sort Sort Key: depname, empno -> Seq Scan on empsalary -(11 rows) +(12 rows) -- As above but adjust the ORDER BY clause to help ensure the plan with the -- minimum amount of sorting wasn't a fluke. @@ -3970,8 +3971,9 @@ ORDER BY depname, empno; QUERY PLAN ----------------------------------------------------------------------------------------------- Unique - -> Sort + -> Incremental Sort Sort Key: depname, empno, enroll_date, (sum(salary) OVER (?)), (min(salary) OVER (?)) + Presorted Key: depname, empno -> WindowAgg -> Incremental Sort Sort Key: depname, empno @@ -3980,7 +3982,7 @@ ORDER BY depname, empno; -> Sort Sort Key: depname, enroll_date -> Seq Scan on empsalary -(11 rows) +(12 rows) RESET enable_hashagg; -- Test Sort node reordering diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index 34020adad1..1643526d99 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -69,6 +69,14 @@ SET jit_above_cost TO DEFAULT; CREATE TABLE distinct_group_2 AS SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g; +SET enable_seqscan = 0; + +-- Check to see we get an incremental sort plan +EXPLAIN (costs off) +SELECT DISTINCT hundred, two FROM tenk1; + +RESET enable_seqscan; + SET enable_hashagg=TRUE; -- Produce results with hash aggregation.