On Thu, 13 Oct 2022 at 16:47, Richard Guo <guofengli...@gmail.com> wrote: > Currently in the patch the optimization is done before we check for > presorted paths or do the explicit sort of the cheapest path. How about > we move this optimization into the branch where we've found any > presorted paths? Maybe something like:
I've attached a patch to that effect, but it turns out a bit more complex than what you imagined. We still need to handle the case for when there's no path that has the required pathkeys and we must add a SortPath to the cheapest path. That requires adding some similar code to add the LimitPath after the foreach loop over the pathlist is over. I was also getting some weird plans like: regression=# explain select distinct on (four) four,hundred from tenk1 where four=0 order by 1,2; QUERY PLAN ---------------------------------------------------------------------- Sort (cost=0.20..0.20 rows=1 width=8) Sort Key: hundred -> Limit (cost=0.00..0.19 rows=1 width=8) -> Seq Scan on tenk1 (cost=0.00..470.00 rows=2500 width=8) Filter: (four = 0) To stop the planner from adding that final sort, I opted to hack the LimitPath's pathkeys to say that it's already sorted by the PlannerInfo's sort_pathkeys. That feels slightly icky, but it does seem a little wasteful to initialise a sort node on every execution of the plan to sort a single tuple. David
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5d0fd6e072..5c4e4dc5cd 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4780,11 +4780,53 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) { - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * distinct_pathkeys may have become empty if all of the + * pathkeys were determined to be redundant. If all of the + * pathkeys are redundant then each DISTINCT target must only + * allow a single value, therefore all resulting tuples must + * be identical. We can uniquify these tuples simply by just + * taking the first tuple. All we do here is add a path to do + * LIMIT 1 on path. When doing DISTINCT ON we may still have + * a non-NIL sort_pathkeys list, so we must still only do this + * with paths which are contained in needed_pathkeys. + */ + if (root->distinct_pathkeys == NIL) + { + Node *limitCount; + LimitPath *lpath; + + limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, + sizeof(int64), + Int64GetDatum(1), false, + FLOAT8PASSBYVAL); + + /* + * If the query already has a LIMIT clause, then we could + * end up with a duplicate LimitPath in the final plan. + * That does not seem worth troubling over too much. + */ + lpath = create_limit_path(root, distinct_rel, path, NULL, + limitCount, LIMIT_OPTION_COUNT, + 0, 1); + + /* + * We set the LimitPath's pathkeys to the sort_pathkeys + * since there can only be a single row after the LIMIT to + * save the planner adding a useless SortPath for the + * ORDER BY. + */ + lpath->path.pathkeys = root->sort_pathkeys; + add_path(distinct_rel, (Path *) lpath); + } + else + { + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); + } } } @@ -4805,13 +4847,33 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, path = (Path *) create_sort_path(root, distinct_rel, path, needed_pathkeys, - -1.0); + root->distinct_pathkeys == NIL ? + 1.0 : -1.0); - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + /* + * As above, use a LimitPath instead of a UniquePath when all of the + * distinct_pathkeys are redundant and we're only going to get a + * series of tuples all with the same values anyway. + */ + if (root->distinct_pathkeys == NIL) + { + Node *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid, + sizeof(int64), + Int64GetDatum(1), false, + FLOAT8PASSBYVAL); + + add_path(distinct_rel, (Path *) + create_limit_path(root, distinct_rel, path, NULL, + limitCount, LIMIT_OPTION_COUNT, 0, 1)); + } + else + { + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); + } } /* diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 748419cee0..6ce889d87c 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather; RESET min_parallel_table_scan_size; RESET parallel_setup_cost; RESET parallel_tuple_cost; +-- +-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when +-- all of the distinct_pathkeys have been marked as redundant +-- +-- Ensure we get a plan with a Limit 1 +EXPLAIN (COSTS OFF) +SELECT DISTINCT four FROM tenk1 WHERE four = 0; + QUERY PLAN +---------------------------- + Limit + -> Seq Scan on tenk1 + Filter: (four = 0) +(3 rows) + +-- Ensure the above gives us the correct result +SELECT DISTINCT four FROM tenk1 WHERE four = 0; + four +------ + 0 +(1 row) + +-- Ensure we get a plan with a Limit 1 +EXPLAIN (COSTS OFF) +SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0; + QUERY PLAN +--------------------------------------------- + Limit + -> Seq Scan on tenk1 + Filter: ((two <> 0) AND (four = 0)) +(3 rows) + +-- Ensure no rows are returned +SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0; + four +------ +(0 rows) + +-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants +EXPLAIN (COSTS OFF) +SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0; + QUERY PLAN +---------------------------- + Limit + -> Seq Scan on tenk1 + Filter: (four = 0) +(3 rows) + +-- Ensure we only get 1 row +SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0; + four | ?column? | ?column? | ?column? +------+----------+----------+---------- + 0 | 1 | 2 | 3 +(1 row) + -- -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its -- very own regression file. diff --git a/src/test/regress/expected/select_distinct_on.out b/src/test/regress/expected/select_distinct_on.out index 3d98990991..b2978c1114 100644 --- a/src/test/regress/expected/select_distinct_on.out +++ b/src/test/regress/expected/select_distinct_on.out @@ -73,3 +73,53 @@ select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2; 0 | -2147483647 (1 row) +-- +-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when +-- all of the distinct_pathkeys have been marked as redundant +-- +-- Ensure we also get a LIMIT plan with DISTINCT ON +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1; + QUERY PLAN +---------------------------------- + Result + -> Limit + -> Seq Scan on tenk1 + Filter: (four = 0) +(4 rows) + +-- and check the result of the above query is correct +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1; + four | two +------+----- + 0 | 0 +(1 row) + +-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1,2; + QUERY PLAN +---------------------------------- + Limit + -> Sort + Sort Key: two + -> Seq Scan on tenk1 + Filter: (four = 0) +(5 rows) + +-- Same again but use a column that is indexed so that we get an index scan +-- then a limit +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,hundred + FROM tenk1 WHERE four = 0 ORDER BY 1,2; + QUERY PLAN +----------------------------------------------------- + Result + -> Limit + -> Index Scan using tenk1_hundred on tenk1 + Filter: (four = 0) +(4 rows) + diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index f27ff714f8..34020adad1 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size; RESET parallel_setup_cost; RESET parallel_tuple_cost; +-- +-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when +-- all of the distinct_pathkeys have been marked as redundant +-- + +-- Ensure we get a plan with a Limit 1 +EXPLAIN (COSTS OFF) +SELECT DISTINCT four FROM tenk1 WHERE four = 0; + +-- Ensure the above gives us the correct result +SELECT DISTINCT four FROM tenk1 WHERE four = 0; + +-- Ensure we get a plan with a Limit 1 +EXPLAIN (COSTS OFF) +SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0; + +-- Ensure no rows are returned +SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0; + +-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants +EXPLAIN (COSTS OFF) +SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0; + +-- Ensure we only get 1 row +SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0; + -- -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its -- very own regression file. diff --git a/src/test/regress/sql/select_distinct_on.sql b/src/test/regress/sql/select_distinct_on.sql index 0920bd64b9..261e6140fe 100644 --- a/src/test/regress/sql/select_distinct_on.sql +++ b/src/test/regress/sql/select_distinct_on.sql @@ -17,3 +17,28 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two -- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2; + +-- +-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when +-- all of the distinct_pathkeys have been marked as redundant +-- + +-- Ensure we also get a LIMIT plan with DISTINCT ON +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1; + +-- and check the result of the above query is correct +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1; + +-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,two + FROM tenk1 WHERE four = 0 ORDER BY 1,2; + +-- Same again but use a column that is indexed so that we get an index scan +-- then a limit +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (four) four,hundred + FROM tenk1 WHERE four = 0 ORDER BY 1,2;