While working on the regression tests added in a14a58329, I noticed that DISTINCT does not make use of Incremental Sort. It'll only ever do full sorts on the cheapest input path or make use of a path that's already got the required pathkeys. Also, I see that create_final_distinct_paths() is a little quirky: if the cheapest input path happens to be sorted, it'll add_path() the same path twice, which seems like a bit of a waste of effort. That could happen if, say, enable_seqscan is off or if a Merge Join is the cheapest join method.
Additionally, the parallel DISTINCT code looks like it should also get the same treatment. I see that I'd coded this to only add a unique path atop a presorted path, and it never considers sorting the cheapest partial path. I've adjusted that in the attached and also made it consider incrementally sorting any path with presorted keys. Please see the attached patch. David
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 000d757bdd..c908944071 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4654,22 +4654,63 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, cheapest_partial_path->rows, NULL, NULL); - /* first try adding unique paths atop of sorted paths */ + /* + * First try sorting the cheapest path and incrementally sorting any paths + * with presorted keys and put a unique paths atop of those. + */ if (grouping_is_sortable(parse->distinctClause)) { foreach(lc, input_rel->partial_pathlist) { - Path *path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path; + bool is_sorted; + int presorted_keys; - if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys)) + is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys, + input_path->pathkeys, + &presorted_keys); + + if (is_sorted) + sorted_path = input_path; + else { - add_partial_path(partial_distinct_rel, (Path *) - create_upper_unique_path(root, - partial_distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * Try at least sorting the cheapest path and also try + * incrementally sorting any path which is partially sorted + * already (no need to deal with paths which have presorted keys + * when incremental sort is disabled unless it's the cheapest + * partial path). + */ + if (input_path != cheapest_partial_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; + + /* + * We've no need to consider both a sort and incremental sort. + * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. 
+ */ + if (presorted_keys == 0 || !enable_incremental_sort) + sorted_path = (Path *) create_sort_path(root, + partial_distinct_rel, + input_path, + root->distinct_pathkeys, + -1.0); + else + sorted_path = (Path *) create_incremental_sort_path(root, + partial_distinct_rel, + input_path, + root->distinct_pathkeys, + presorted_keys, + -1.0); } + + add_partial_path(partial_distinct_rel, (Path *) + create_upper_unique_path(root, partial_distinct_rel, + sorted_path, + list_length(root->distinct_pathkeys), + numDistinctRows)); } } @@ -4785,8 +4826,8 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, * the other.) */ List *needed_pathkeys; - Path *path; ListCell *lc; + double limittuples = root->distinct_pathkeys == NIL ? 1.0 : -1.0; if (parse->hasDistinctOn && list_length(root->distinct_pathkeys) < @@ -4797,96 +4838,89 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, foreach(lc, input_rel->pathlist) { - path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path; + bool is_sorted; + int presorted_keys; - if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) + is_sorted = pathkeys_count_contained_in(needed_pathkeys, + input_path->pathkeys, + &presorted_keys); + + if (is_sorted) + sorted_path = input_path; + else { /* - * distinct_pathkeys may have become empty if all of the - * pathkeys were determined to be redundant. If all of the - * pathkeys are redundant then each DISTINCT target must only - * allow a single value, therefore all resulting tuples must - * be identical (or at least indistinguishable by an equality - * check). We can uniquify these tuples simply by just taking - * the first tuple. All we do here is add a path to do "LIMIT - * 1" atop of 'path'. When doing a DISTINCT ON we may still - * have a non-NIL sort_pathkeys list, so we must still only do - * this with paths which are correctly sorted by - * sort_pathkeys. 
+ * Try at least sorting the cheapest path and also try + * incrementally sorting any path which is partially sorted + * already (no need to deal with paths which have presorted keys + * when incremental sort is disabled unless it's the cheapest + * input path). */ - if (root->distinct_pathkeys == NIL) - { - Node *limitCount; - - limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, - sizeof(int64), - Int64GetDatum(1), false, - FLOAT8PASSBYVAL); + if (input_path != cheapest_input_path && + (presorted_keys == 0 || !enable_incremental_sort)) + continue; - /* - * If the query already has a LIMIT clause, then we could - * end up with a duplicate LimitPath in the final plan. - * That does not seem worth troubling over too much. - */ - add_path(distinct_rel, (Path *) - create_limit_path(root, distinct_rel, path, NULL, - limitCount, LIMIT_OPTION_COUNT, - 0, 1)); - } + /* + * We've no need to consider both a sort and incremental sort. + * We'll just do a sort if there are no presorted keys and an + * incremental sort when there are presorted keys. + */ + if (presorted_keys == 0 || !enable_incremental_sort) + sorted_path = (Path *) create_sort_path(root, + distinct_rel, + input_path, + needed_pathkeys, + limittuples); else - { - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); - } + sorted_path = (Path *) create_incremental_sort_path(root, + distinct_rel, + input_path, + needed_pathkeys, + presorted_keys, + limittuples); } - } - /* For explicit-sort case, always use the more rigorous clause */ - if (list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) - { - needed_pathkeys = root->sort_pathkeys; - /* Assert checks that parser didn't mess up... 
*/ - Assert(pathkeys_contained_in(root->distinct_pathkeys, - needed_pathkeys)); - } - else - needed_pathkeys = root->distinct_pathkeys; + /* + * distinct_pathkeys may have become empty if all of the pathkeys + * were determined to be redundant. If all of the pathkeys are + * redundant then each DISTINCT target must only allow a single + * value, therefore all resulting tuples must be identical (or at + * least indistinguishable by an equality check). We can uniquify + * these tuples simply by just taking the first tuple. All we do + * here is add a path to do "LIMIT 1" atop of 'sorted_path'. When + * doing a DISTINCT ON we may still have a non-NIL sort_pathkeys + * list, so we must still only do this with paths which are + * correctly sorted by sort_pathkeys. + */ + if (root->distinct_pathkeys == NIL) + { + Node *limitCount; - path = cheapest_input_path; - if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - path = (Path *) create_sort_path(root, distinct_rel, - path, - needed_pathkeys, - root->distinct_pathkeys == NIL ? - 1.0 : -1.0); + limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, + sizeof(int64), + Int64GetDatum(1), false, + FLOAT8PASSBYVAL); - /* - * As above, use a LimitPath instead of a UniquePath when all of the - * distinct_pathkeys are redundant and we're only going to get a - * series of tuples all with the same values anyway. - */ - if (root->distinct_pathkeys == NIL) - { - Node *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, - sizeof(int64), - Int64GetDatum(1), false, - FLOAT8PASSBYVAL); - - add_path(distinct_rel, (Path *) - create_limit_path(root, distinct_rel, path, NULL, - limitCount, LIMIT_OPTION_COUNT, 0, 1)); - } - else - { - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * If the query already has a LIMIT clause, then we could + * end up with a duplicate LimitPath in the final plan. 
+ * That does not seem worth troubling over too much. + */ + add_path(distinct_rel, (Path *) + create_limit_path(root, distinct_rel, sorted_path, + NULL, limitCount, + LIMIT_OPTION_COUNT, 0, 1)); + } + else + { + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + sorted_path, + list_length(root->distinct_pathkeys), + numDistinctRows)); + } } } diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index 1a1e8b2365..0c3433f8e5 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1484,15 +1484,16 @@ explain (costs off) select * from t union select * from t order by 1,3; -- Full sort, not just incremental sort can be pushed below a gather merge path -- by generate_useful_gather_paths. explain (costs off) select distinct a,b from t; - QUERY PLAN ------------------------------------------- + QUERY PLAN +------------------------------------------------ Unique -> Gather Merge Workers Planned: 2 - -> Sort - Sort Key: a, b - -> Parallel Seq Scan on t -(6 rows) + -> Unique + -> Sort + Sort Key: a, b + -> Parallel Seq Scan on t +(7 rows) drop table t; -- Sort pushdown can't go below where expressions are part of the rel target. 
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 6ce889d87c..1fc07f220f 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -171,6 +171,20 @@ SELECT DISTINCT g%1000 FROM generate_series(0,9999) g; SET jit_above_cost TO DEFAULT; CREATE TABLE distinct_group_2 AS SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g; +SET enable_seqscan = 0; +-- Check to see we get an incremental sort plan +EXPLAIN (costs off) +SELECT DISTINCT hundred, two FROM tenk1; + QUERY PLAN +----------------------------------------------------- + Unique + -> Incremental Sort + Sort Key: hundred, two + Presorted Key: hundred + -> Index Scan using tenk1_hundred on tenk1 +(5 rows) + +RESET enable_seqscan; SET enable_hashagg=TRUE; -- Produce results with hash aggregation. SET enable_sort=FALSE; @@ -265,15 +279,16 @@ $$ LANGUAGE plpgsql PARALLEL SAFE; -- Ensure we do parallel distinct now that the function is parallel safe EXPLAIN (COSTS OFF) SELECT DISTINCT distinct_func(1) FROM tenk1; - QUERY PLAN ----------------------------------------------- + QUERY PLAN +---------------------------------------------------- Unique - -> Sort - Sort Key: (distinct_func(1)) - -> Gather - Workers Planned: 2 - -> Parallel Seq Scan on tenk1 -(6 rows) + -> Gather Merge + Workers Planned: 2 + -> Unique + -> Sort + Sort Key: (distinct_func(1)) + -> Parallel Seq Scan on tenk1 +(7 rows) RESET max_parallel_workers_per_gather; RESET min_parallel_table_scan_size; diff --git a/src/test/regress/expected/window.out b/src/test/regress/expected/window.out index 90e89fb5b6..b2c6605e60 100644 --- a/src/test/regress/expected/window.out +++ b/src/test/regress/expected/window.out @@ -3944,8 +3944,9 @@ ORDER BY depname, enroll_date; QUERY PLAN ----------------------------------------------------------------------------------------------- Unique - -> Sort + -> Incremental Sort Sort Key: depname, 
enroll_date, empno, (sum(salary) OVER (?)), (min(salary) OVER (?)) + Presorted Key: depname, enroll_date -> WindowAgg -> Incremental Sort Sort Key: depname, enroll_date @@ -3954,7 +3955,7 @@ ORDER BY depname, enroll_date; -> Sort Sort Key: depname, empno -> Seq Scan on empsalary -(11 rows) +(12 rows) -- As above but adjust the ORDER BY clause to help ensure the plan with the -- minimum amount of sorting wasn't a fluke. @@ -3970,8 +3971,9 @@ ORDER BY depname, empno; QUERY PLAN ----------------------------------------------------------------------------------------------- Unique - -> Sort + -> Incremental Sort Sort Key: depname, empno, enroll_date, (sum(salary) OVER (?)), (min(salary) OVER (?)) + Presorted Key: depname, empno -> WindowAgg -> Incremental Sort Sort Key: depname, empno @@ -3980,7 +3982,7 @@ ORDER BY depname, empno; -> Sort Sort Key: depname, enroll_date -> Seq Scan on empsalary -(11 rows) +(12 rows) RESET enable_hashagg; -- Test Sort node reordering diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index 34020adad1..1643526d99 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -69,6 +69,14 @@ SET jit_above_cost TO DEFAULT; CREATE TABLE distinct_group_2 AS SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g; +SET enable_seqscan = 0; + +-- Check to see we get an incremental sort plan +EXPLAIN (costs off) +SELECT DISTINCT hundred, two FROM tenk1; + +RESET enable_seqscan; + SET enable_hashagg=TRUE; -- Produce results with hash aggregation.