Hi all! I think I may have stumbled across a case of wrong results on HEAD (same through version 9.6, though interestingly 9.5 produces different results altogether).
test=# SELECT i AS ai1, i AS ai2 FROM generate_series(1,3)i GROUP BY ai2, ROLLUP(ai1) ORDER BY ai1, ai2; ai1 | ai2 -----+----- 1 | 1 | 1 2 | 2 | 2 3 | 3 | 3 (6 rows) I had expected: ai1 | ai2 -----+----- 1 | 1 2 | 2 3 | 3 | 1 | 2 | 3 (6 rows) It seems to me that the plan is missing a Sort node (on ai1 and ai2) above the Aggregate node. QUERY PLAN ------------------------------------------------ GroupAggregate Group Key: i, i Group Key: i -> Sort Sort Key: i -> Function Scan on generate_series i I have a hunch part of the issue may be an assumption that a duplicate aliased column will produce the same column values and hence isn't included in the range table, nor subsequently the pathkeys. However, that assumption does not seem to be true for queries with multiple grouping set specifications: test=# SELECT i as ai1, i as ai2 FROM (values (1),(2),(3)) v(i) GROUP BY ai1, ROLLUP(ai2); ai1 | ai2 -----+----- 1 | 1 2 | 2 3 | 3 1 | 2 | 3 | (6 rows) It seems to me that excluding the duplicate alias from the pathkeys is leading to a case where the group order is incorrectly determined to satisfy the sort order. Thus create_ordered_paths() decides against applying an explicit sort node. But simply forcing an explicit sort still seems wrong since we've effectively lost a relevant column for the sort. I tinkered a bit and hacked together an admittedly ugly patch that triggers an explicit sort constructed from the parse tree. An alternative approach I had considered was to update the rewriteHandler to explicitly force the existence of the duplicate alias column in the range tables. But that also felt meh. Does this seem like a legit issue? And if so, any thoughts on alternative approaches? Thanks, David Kimura
commit 7e7b9a0dce1ba60f4fc087082f04b04e7f405539 Author: David Kimura <dkimura@vmware.com> Date: Wed Oct 19 21:32:27 2022 +0000 Fix multiple grouping specs using duplicate alias bug There seems to have been an assumption that multiple aliases that refer to the same column would always produce mirrored results. However, that does not seem to be the case when the aliased columns are passed through different grouping set specs. Consider the case: SELECT i AS alias1, i AS alias2 FROM generate_series(1,3)i GROUP BY alias2, ROLLUP(alias1); This led to a scenario where a relevant column is collapsed at parse time and thus excluded from sort pathkeys. Consequently group order could then incorrectly decide it satisfied the sort order. Which leads to create_ordered_paths() forgoing an explicit sort node. A solution to the issue is to force an explicit sort and revive the relevant column info. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index ac86ce9003..0b81d4d211 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2157,6 +2157,46 @@ change_plan_targetlist(Plan *subplan, List *tlist, bool tlist_parallel_safe) return subplan; } +/* + * reevaluate_sort_info + * + * Build the sort node info from the parse tree. + */ +static void +reevaluate_sort_info(Query *parse, Sort *plan) +{ + int i = 0; + ListCell *l1; + ListCell *l2; + + plan->numCols = list_length(parse->sortClause); + + plan->sortColIdx = palloc(sizeof(int) * plan->numCols); + plan->sortOperators = palloc(sizeof(int) * plan->numCols); + plan->collations = palloc(sizeof(int) * plan->numCols); + plan->nullsFirst = palloc(sizeof(int) * plan->numCols); + + foreach(l1, parse->sortClause) + { + foreach(l2, parse->targetList) + { + SortGroupClause *sortClause = (SortGroupClause *) lfirst(l1); + TargetEntry *targetEntry = (TargetEntry *) lfirst(l2); + + if (sortClause->tleSortGroupRef == targetEntry->ressortgroupref) + { + plan->sortColIdx[i] = targetEntry->resno; + plan->sortOperators[i] = sortClause->sortop; + plan->collations[i] = exprCollation((Node *) targetEntry->expr); + plan->nullsFirst[i] = sortClause->nulls_first; + + i++; + break; + } + } + } +} + /* * create_sort_plan * @@ -2187,6 +2227,9 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags) IS_OTHER_REL(best_path->subpath->parent) ? best_path->path.parent->relids : NULL); + if (best_path->need_evaluation) + reevaluate_sort_info(root->parse, plan); + copy_generic_path_info(&plan->plan, (Path *) best_path); return plan; @@ -2213,6 +2256,9 @@ create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path, best_path->spath.path.parent->relids : NULL, best_path->nPresortedCols); + if (best_path->spath.need_evaluation) + reevaluate_sort_info(root->parse, &plan->sort); + copy_generic_path_info(&plan->sort.plan, (Path *) best_path); return plan; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5d0fd6e072..433fbddd06 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4852,6 +4852,60 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, return distinct_rel; } +/* + * path_is_sorted + * + * Check whether the Path satisfies sort requirement or needs evaulation. + * + * Under normal circumstances it is sufficient to simply check that the + * grouping path is at least as sorted as the sort path. However, this is not + * the case for multiple grouping sets using duplicate alias columns. The + * reason is because the parser removes the duplicate column from the projected + * range table targets, which in turn removes it from the group and sort paths. + * In that case, require an explict sort that needs evaulation. + */ +static bool +path_is_sorted(PlannerInfo *root, + Path *input_path, + int *presorted_keys, + bool *need_evaluation) +{ + *need_evaluation = false; + + if (IsA(input_path, GroupingSetsPath)) + { + ListCell *lc1, + *lc2, + *lc3; + GroupingSetsPath *gspath = (GroupingSetsPath *) input_path; + + if (list_length(gspath->rollups) > 1) + *need_evaluation = true; + + foreach(lc1, gspath->rollups) + { + List *groupClause = ((RollupData *) lfirst(lc1))->groupClause; + + if (list_length(groupClause) == list_length(root->parse->sortClause)) + { + forboth(lc2, groupClause, lc3, root->parse->sortClause) + { + if (!equal(lfirst(lc2), lfirst(lc3))) + { + *need_evaluation = true; + break; + } + } + } + } + } + + return pathkeys_count_contained_in(root->sort_pathkeys, + input_path->pathkeys, presorted_keys) && + !(*need_evaluation); + +} + /* * create_ordered_paths * @@ -4904,10 +4958,12 @@ create_ordered_paths(PlannerInfo *root, Path *input_path = (Path *) lfirst(lc); Path *sorted_path = input_path; bool is_sorted; + bool need_evaluation; int presorted_keys; - is_sorted = pathkeys_count_contained_in(root->sort_pathkeys, - input_path->pathkeys, &presorted_keys); + is_sorted = path_is_sorted(root, input_path, + &presorted_keys, + &need_evaluation); if (is_sorted) { @@ -4936,6 +4992,8 @@ create_ordered_paths(PlannerInfo *root, input_path, root->sort_pathkeys, limit_tuples); + ((SortPath *) sorted_path)->need_evaluation = need_evaluation; + /* Add projection step if needed */ if (sorted_path->pathtarget != target) sorted_path = apply_projection_to_path(root, ordered_rel, @@ -4965,6 +5023,7 @@ create_ordered_paths(PlannerInfo *root, root->sort_pathkeys, presorted_keys, limit_tuples); + ((IncrementalSortPath *) sorted_path)->spath.need_evaluation = need_evaluation; /* Add projection step if needed */ if (sorted_path->pathtarget != target) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 70f61ae7b1..1aed34f35c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2932,6 +2932,7 @@ create_incremental_sort_path(PlannerInfo *root, pathnode->path.pathkeys = pathkeys; pathnode->subpath = subpath; + pathnode->need_evaluation = false; cost_incremental_sort(&pathnode->path, root, pathkeys, presorted_keys, @@ -2979,6 +2980,7 @@ create_sort_path(PlannerInfo *root, pathnode->path.pathkeys = pathkeys; pathnode->subpath = subpath; + pathnode->need_evaluation = false; cost_sort(&pathnode->path, root, pathkeys, subpath->total_cost, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 6bda383bea..939711f777 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -2065,6 +2065,8 @@ typedef struct SortPath { Path path; Path *subpath; /* path representing input source */ + + bool need_evaluation; /* if sort must re-build from parse tree */ } SortPath; /* diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index fcad5c4093..6a09232d94 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -2151,4 +2151,154 @@ select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1; 0 (1 row) +-- test duplicate alias used in multiple grouping set specs and sort ordering +set enable_hashagg=off; +explain (costs off) +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: "*VALUES*".column1, "*VALUES*".column1 + -> GroupAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column1 + Group Key: "*VALUES*".column1 + -> Sort + Sort Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" +(8 rows) + +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1; + ai1 | ai2 +-----+----- + 1 | 1 + 2 | 2 + 3 | 3 + 1 | + 2 | + 3 | +(6 rows) + +explain (costs off) +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2; + QUERY PLAN +----------------------------------------------------- + GroupAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column1 + Group Key: "*VALUES*".column1 + -> Sort + Sort Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" +(6 rows) + +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2; + ai1 | ai2 +-----+----- + 1 | 1 + 1 | + 2 | 2 + 2 | + 3 | 3 + 3 | +(6 rows) + +explain (costs off) +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1; + QUERY PLAN +------------------------------------------------------ + Sort + Sort Key: i, i + -> GroupAggregate + Group Key: i, i + Group Key: i + -> Sort + Sort Key: i + -> Function Scan on generate_series i +(8 rows) + +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1; + ai1 | ai2 +-----+----- + 1 | 1 + 2 | 2 + 3 | 3 + 1 | + 2 | + 3 | +(6 rows) + +explain (costs off) +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2; + QUERY PLAN +------------------------------------------------ + GroupAggregate + Group Key: i, i + Group Key: i + -> Sort + Sort Key: i + -> Function Scan on generate_series i +(6 rows) + +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2; + ai1 | ai2 +-----+----- + 1 | 1 + 1 | + 2 | 2 + 2 | + 3 | 3 + 3 | +(6 rows) + +create table gs_dup_alias as +select i from generate_series(1, 3) i; +explain (costs off) +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1; + QUERY PLAN +-------------------------------------------- + Incremental Sort + Sort Key: i, i + Presorted Key: i + -> GroupAggregate + Group Key: i, i + Group Key: i + -> Sort + Sort Key: i + -> Seq Scan on gs_dup_alias +(9 rows) + +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1; + ai1 | ai2 +-----+----- + 1 | 1 + 2 | 2 + 3 | 3 + 1 | + 2 | + 3 | +(6 rows) + +explain (costs off) +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2; + QUERY PLAN +-------------------------------------- + GroupAggregate + Group Key: i, i + Group Key: i + -> Sort + Sort Key: i + -> Seq Scan on gs_dup_alias +(6 rows) + +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2; + ai1 | ai2 +-----+----- + 1 | 1 + 1 | + 2 | 2 + 2 | + 3 | 3 + 3 | +(6 rows) + +reset enable_hashagg; -- end diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql index 90ba27257a..36c9d7a4a8 100644 --- a/src/test/regress/sql/groupingsets.sql +++ b/src/test/regress/sql/groupingsets.sql @@ -589,4 +589,31 @@ explain (costs off) select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1; select (select grouping(v1)) from (values ((select 1))) v(v1) group by v1; +-- test duplicate alias used in multiple grouping set specs and sort ordering +set enable_hashagg=off; +explain (costs off) +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1; +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai2, ai1; +explain (costs off) +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2; +select i as ai1, i as ai2 from (values (1),(2),(3)) v(i) group by ai1, rollup(ai2) order by ai1, ai2; + +explain (costs off) +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1; +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai2, ai1; +explain (costs off) +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2; +select i as ai1, i as ai2 from generate_series(1,3)i group by ai1, rollup(ai2) order by ai1, ai2; + +create table gs_dup_alias as +select i from generate_series(1, 3) i; +explain (costs off) +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1; +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai2, ai1; +explain (costs off) +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2; +select i as ai1, i as ai2 from gs_dup_alias group by ai1, rollup(ai2) order by ai1, ai2; + +reset enable_hashagg; + -- end