On Fri, Nov 29, 2024 at 2:36 PM Andrei Lepikhov <lepi...@gmail.com> wrote: > > > I forgot to do a local commit before sending v8. Fixed in the attached v9. > 1. Thread reference in the patch comment doesn't work. fixed.
I found that unique expression index can also be used for groupby column removal. so I implemented it, aslo added two tests on it. now remove_useless_groupby_columns like: if (index->indexprs == NIL) { --handle unique index } else { --handle unique expression index } what do you think?
From ca9302a8c98f2ad4a8f9837ec42ec2da6588e25d Mon Sep 17 00:00:00 2001 From: jian he <jian.universal...@gmail.com> Date: Fri, 29 Nov 2024 14:54:23 +0800 Subject: [PATCH v10 1/1] remove useless group by columns via unique-not-null In the `remove_useless_groupby_columns` function, use a unique index and a not-null constraint to eliminate unnecessary columns. Here, the primary key is treated as a "unique index and not-null". If there are multiple candidates, use the unique index with the fewest columns to remove the other columns in groupby expression. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark --- src/backend/optimizer/plan/initsplan.c | 258 ++++++++++++++++++++++- src/backend/optimizer/plan/planmain.c | 3 + src/backend/optimizer/plan/planner.c | 163 -------------- src/backend/optimizer/util/plancat.c | 1 + src/include/nodes/pathnodes.h | 2 + src/include/optimizer/planmain.h | 1 + src/test/regress/expected/aggregates.out | 79 +++++++ src/test/regress/sql/aggregates.sql | 39 ++++ 8 files changed, 382 insertions(+), 164 deletions(-) diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 903c397d40..bd82745f1e 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * initsplan.c - * Target list, qualification, joininfo initialization routines + * Target list, group by, qualification, joininfo initialization routines * * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -387,6 +387,262 @@ add_vars_to_attr_needed(PlannerInfo *root, List *vars, } +/***************************************************************************** + * + * GROUP BY + * + *****************************************************************************/ + +/* + * remove_useless_groupby_columns + * Remove any columns in the GROUP BY clause that are redundant due to + * being functionally dependent on other GROUP BY columns. + * + * Since some other DBMSes do not allow references to ungrouped columns, it's + * not unusual to find all columns listed in GROUP BY even though listing the + * primary-key columns, or columns of a unique constraint would be sufficient. + * Deleting such excess columns avoids redundant sorting or hashing work, so + * it's worth doing. + * + * Relcache invalidations will ensure that cached plans become invalidated + * when the underlying supporting indexes are dropped or if a column's NOT + * NULL attribute is removed. + */ +void +remove_useless_groupby_columns(PlannerInfo *root) +{ + Query *parse = root->parse; + Bitmapset **groupbyattnos; + Bitmapset **surplusvars; + ListCell *lc; + int relid; + + /* No chance to do anything if there are less than two GROUP BY items */ + if (list_length(root->processed_groupClause) < 2) + return; + + /* Don't fiddle with the GROUP BY clause if the query has grouping sets */ + if (parse->groupingSets) + return; + + /* + * Scan the GROUP BY clause to find GROUP BY items that are simple Vars. + * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k + * that are GROUP BY items. + */ + groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + 1)); + foreach(lc, root->processed_groupClause) + { + SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); + TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); + Var *var = (Var *) tle->expr; + + /* + * Ignore non-Vars and Vars from other query levels. + * + * XXX in principle, stable expressions containing Vars could also be + * removed, if all the Vars are functionally dependent on other GROUP + * BY items. But it's not clear that such cases occur often enough to + * be worth troubling over. + */ + if (!IsA(var, Var) || + var->varlevelsup > 0) + continue; + + /* OK, remember we have this Var */ + relid = var->varno; + Assert(relid <= list_length(parse->rtable)); + groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], + var->varattno - FirstLowInvalidHeapAttributeNumber); + } + + /* + * Consider each relation and see if it is possible to remove some of its + * Vars from GROUP BY. For simplicity and speed, we do the actual removal + * in a separate pass. Here, we just fill surplusvars[k] with a bitmapset + * of the column attnos of RTE k that are removable GROUP BY items. + */ + surplusvars = NULL; /* don't allocate array unless required */ + relid = 0; + foreach(lc, parse->rtable) + { + RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); + RelOptInfo *rel; + Bitmapset *relattnos; + Bitmapset *best_keycolumns = NULL; + int32 best_nkeycolumns = PG_INT32_MAX; + + relid++; + + /* Only plain relations could have primary-key constraints */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We must skip inheritance parent tables as some of the child rels + * may cause duplicate rows. This cannot happen with partitioned + * tables, however. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + continue; + + /* Nothing to do unless this rel has multiple Vars in GROUP BY */ + relattnos = groupbyattnos[relid]; + if (bms_membership(relattnos) != BMS_MULTIPLE) + continue; + + rel = root->simple_rel_array[relid]; + + /* + * Now search each index to check if there are any indexes where the + * indexed columns are a subset of the GROUP BY columns for this + * relation. + */ + foreach_node(IndexOptInfo, index, rel->indexlist) + { + Bitmapset *ind_attnos; + bool nulls_check_ok; + + /* + * Skip any non-unique and deferrable indexes. Predicate indexes + * have not been checked yet, so we must skip those too as the + * predOK check might fail. + */ + if (!index->unique || !index->immediate || index->indpred != NIL) + continue; + + ind_attnos = NULL; + nulls_check_ok = true; + if (index->indexprs == NIL) + { + for (int i = 0; i < index->nkeycolumns; i++) + { + /* + * We must insist that the index columns are all defined NOT + * NULL otherwise duplicate NULLs could exists. However, we + * can relax this check when the index is defined with NULLS + * NOT DISTINCT as there can only be 1 NULL row, therefore + * functional dependency on the unique columns is maintained, + * despite the NULL. + */ + if (!index->nullsnotdistinct && + !bms_is_member(index->indexkeys[i], + rel->notnullattnums)) + { + nulls_check_ok = false; + break; + } + + ind_attnos = + bms_add_member(ind_attnos, + index->indexkeys[i] - + FirstLowInvalidHeapAttributeNumber); + } + } + else + { + int attrnos = -1; + + Bitmapset *expr_attrs = NULL; + ListCell *indexpr_item = NULL; + + indexpr_item = list_head(index->indexprs); + pull_varattnos((Node *) lfirst(indexpr_item), 1, &expr_attrs); + + /* + * pull_varattnos returned varattnos is offset by + * FirstLowInvalidHeapAttributeNumber. we need add + * FirstLowInvalidHeapAttributeNumber for comparison with + * rel->notnullattnums + */ + while ((attrnos = bms_next_member(expr_attrs, attrnos)) >= 0) + { + if (!index->nullsnotdistinct && + !bms_is_member(attrnos + FirstLowInvalidHeapAttributeNumber, rel->notnullattnums)) + { + nulls_check_ok = false; + break; + } + } + + ind_attnos = expr_attrs; + } + + if (!nulls_check_ok) + continue; + + /* + * Skip any indexes where the indexed columns aren't a subset of + * the GROUP BY. + */ + if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1) + continue; + + /* + * Record the attribute numbers from the index with the fewest + * columns. This allows the largest number of columns to be + * removed from the GROUP BY clause. In the future, we may wish + * to consider using the narrowest set of columns and looking at + * pg_statistic.stawidth as it might be better to use an index + * with, say two INT4s, rather than, say, one long varlena column. + */ + if (index->nkeycolumns < best_nkeycolumns) + { + best_keycolumns = ind_attnos; + best_nkeycolumns = index->nkeycolumns; + } + } + + /* Did we find a suitable index? */ + if (best_keycolumns != NULL) + { + /* + * To easily remember whether we've found anything to do, we don't + * allocate the surplusvars[] array until we find something. + */ + if (surplusvars == NULL) + surplusvars = + (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + + 1)); + + /* Remember the attnos of the removable columns */ + surplusvars[relid] = bms_difference(relattnos, best_keycolumns); + } + } + + /* + * If we found any surplus Vars, build a new GROUP BY clause without them. + * (Note: this may leave some TLEs with unreferenced ressortgroupref + * markings, but that's harmless.) + */ + if (surplusvars != NULL) + { + List *new_groupby = NIL; + + foreach(lc, root->processed_groupClause) + { + SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); + TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); + Var *var = (Var *) tle->expr; + + /* + * New list must include non-Vars, outer Vars, and anything not + * marked as surplus. + */ + if (!IsA(var, Var) || var->varlevelsup > 0 || + !bms_is_member(var->varattno - + FirstLowInvalidHeapAttributeNumber, + surplusvars[var->varno])) + new_groupby = lappend(new_groupby, sgc); + } + + root->processed_groupClause = new_groupby; + } +} + + /***************************************************************************** * * LATERAL REFERENCES diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index e17d31a5c3..735560e8ca 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -169,6 +169,9 @@ query_planner(PlannerInfo *root, */ add_base_rels_to_query(root, (Node *) parse->jointree); + /* Remove any redundant GROUP BY columns */ + remove_useless_groupby_columns(root); + /* * Examine the targetlist and join tree, adding entries to baserel * targetlists for all referenced Vars, and generating PlaceHolderInfo diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b665a7762e..352c4edf3a 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -139,7 +139,6 @@ static void preprocess_rowmarks(PlannerInfo *root); static double preprocess_limit(PlannerInfo *root, double tuple_fraction, int64 *offset_est, int64 *count_est); -static void remove_useless_groupby_columns(PlannerInfo *root); static List *preprocess_groupclause(PlannerInfo *root, List *force); static List *extract_rollup_sets(List *groupingSets); static List *reorder_grouping_sets(List *groupingSets, List *sortclause); @@ -1487,8 +1486,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction, { /* Preprocess regular GROUP BY clause, if any */ root->processed_groupClause = preprocess_groupclause(root, NIL); - /* Remove any redundant GROUP BY columns */ - remove_useless_groupby_columns(root); } /* @@ -2724,166 +2721,6 @@ limit_needed(Query *parse) return false; /* don't need a Limit plan node */ } - -/* - * remove_useless_groupby_columns - * Remove any columns in the GROUP BY clause that are redundant due to - * being functionally dependent on other GROUP BY columns. - * - * Since some other DBMSes do not allow references to ungrouped columns, it's - * not unusual to find all columns listed in GROUP BY even though listing the - * primary-key columns would be sufficient. Deleting such excess columns - * avoids redundant sorting work, so it's worth doing. - * - * Relcache invalidations will ensure that cached plans become invalidated - * when the underlying index of the pkey constraint is dropped. - * - * Currently, we only make use of pkey constraints for this, however, we may - * wish to take this further in the future and also use unique constraints - * which have NOT NULL columns. In that case, plan invalidation will still - * work since relations will receive a relcache invalidation when a NOT NULL - * constraint is dropped. - */ -static void -remove_useless_groupby_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - Bitmapset **groupbyattnos; - Bitmapset **surplusvars; - ListCell *lc; - int relid; - - /* No chance to do anything if there are less than two GROUP BY items */ - if (list_length(root->processed_groupClause) < 2) - return; - - /* Don't fiddle with the GROUP BY clause if the query has grouping sets */ - if (parse->groupingSets) - return; - - /* - * Scan the GROUP BY clause to find GROUP BY items that are simple Vars. - * Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k - * that are GROUP BY items. - */ - groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * - (list_length(parse->rtable) + 1)); - foreach(lc, root->processed_groupClause) - { - SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); - TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); - Var *var = (Var *) tle->expr; - - /* - * Ignore non-Vars and Vars from other query levels. - * - * XXX in principle, stable expressions containing Vars could also be - * removed, if all the Vars are functionally dependent on other GROUP - * BY items. But it's not clear that such cases occur often enough to - * be worth troubling over. - */ - if (!IsA(var, Var) || - var->varlevelsup > 0) - continue; - - /* OK, remember we have this Var */ - relid = var->varno; - Assert(relid <= list_length(parse->rtable)); - groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], - var->varattno - FirstLowInvalidHeapAttributeNumber); - } - - /* - * Consider each relation and see if it is possible to remove some of its - * Vars from GROUP BY. For simplicity and speed, we do the actual removal - * in a separate pass. Here, we just fill surplusvars[k] with a bitmapset - * of the column attnos of RTE k that are removable GROUP BY items. - */ - surplusvars = NULL; /* don't allocate array unless required */ - relid = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); - Bitmapset *relattnos; - Bitmapset *pkattnos; - Oid constraintOid; - - relid++; - - /* Only plain relations could have primary-key constraints */ - if (rte->rtekind != RTE_RELATION) - continue; - - /* - * We must skip inheritance parent tables as some of the child rels - * may cause duplicate rows. This cannot happen with partitioned - * tables, however. - */ - if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) - continue; - - /* Nothing to do unless this rel has multiple Vars in GROUP BY */ - relattnos = groupbyattnos[relid]; - if (bms_membership(relattnos) != BMS_MULTIPLE) - continue; - - /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. - */ - pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); - if (pkattnos == NULL) - continue; - - /* - * If the primary key is a proper subset of relattnos then we have - * some items in the GROUP BY that can be removed. - */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) - { - /* - * To easily remember whether we've found anything to do, we don't - * allocate the surplusvars[] array until we find something. - */ - if (surplusvars == NULL) - surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * - (list_length(parse->rtable) + 1)); - - /* Remember the attnos of the removable columns */ - surplusvars[relid] = bms_difference(relattnos, pkattnos); - } - } - - /* - * If we found any surplus Vars, build a new GROUP BY clause without them. - * (Note: this may leave some TLEs with unreferenced ressortgroupref - * markings, but that's harmless.) - */ - if (surplusvars != NULL) - { - List *new_groupby = NIL; - - foreach(lc, root->processed_groupClause) - { - SortGroupClause *sgc = lfirst_node(SortGroupClause, lc); - TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList); - Var *var = (Var *) tle->expr; - - /* - * New list must include non-Vars, outer Vars, and anything not - * marked as surplus. - */ - if (!IsA(var, Var) || - var->varlevelsup > 0 || - !bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber, - surplusvars[var->varno])) - new_groupby = lappend(new_groupby, sgc); - } - - root->processed_groupClause = new_groupby; - } -} - /* * preprocess_groupclause - do preparatory work on GROUP BY clause * diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 37b0ca2e43..153390f2dc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -457,6 +457,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->indrestrictinfo = NIL; /* set later, in indxpath.c */ info->predOK = false; /* set later, in indxpath.c */ info->unique = index->indisunique; + info->nullsnotdistinct = index->indnullsnotdistinct; info->immediate = index->indimmediate; info->hypothetical = false; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index add0f9e45f..0759e00e96 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1187,6 +1187,8 @@ struct IndexOptInfo bool predOK; /* true if a unique index */ bool unique; + /* true if the index was defined with NULLS NOT DISTINCT */ + bool nullsnotdistinct; /* is uniqueness enforced immediately? */ bool immediate; /* true if index doesn't really exist */ diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 93137261e4..0b6f0f7969 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -74,6 +74,7 @@ extern void add_vars_to_targetlist(PlannerInfo *root, List *vars, Relids where_needed); extern void add_vars_to_attr_needed(PlannerInfo *root, List *vars, Relids where_needed); +extern void remove_useless_groupby_columns(PlannerInfo *root); extern void find_lateral_references(PlannerInfo *root); extern void rebuild_lateral_attr_needed(PlannerInfo *root); extern void create_lateral_join_info(PlannerInfo *root); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..23890c6e1f 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1448,6 +1448,85 @@ explain (costs off) select * from p_t1 group by a,b,c,d; -> Seq Scan on p_t1_2 (5 rows) +create unique index t3_c_uidx on t3(c); +-- Ensure we don't remove any columns from the GROUP BY for a unique +-- index on a NULLable column. +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +-- Make the column NOT NULL and ensure we remove the redundant column +alter table t3 alter column c set not null; +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- When there are multiple supporting unique indexes and the GROUP BY contains +-- columns to cover all of those, ensure we pick the index with the least +-- number of columns so that we can remove more columns from the GROUP BY. +explain (costs off) select a,b,c from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- As above but try ordering the columns differently to ensure we get the +-- same result. +explain (costs off) select a,b,c from t3 group by c,a,b; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT +-- NULL constraint on the indexed columns. Ensure the redundant columns are +-- removed from the GROUP BY for such a table. +drop index t3_c_uidx; +alter table t3 alter column c drop not null; +create unique index t3_c_uidx on t3(c) nulls not distinct; +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +--test unique expression index can be used to remove redundant columns. +drop index t3_c_uidx; +create unique index t3_bc on t3((c+b)); +alter table t3 alter column c set not null; +explain(costs off) select a,b,c from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +--test NULLS NOT DISTINCT with unique expression index +create unique index t3_bc_uidx on t3((c+b)) nulls not distinct; +alter table t3 alter column c drop not null; +explain(costs off) select a,b,c from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + drop table t1 cascade; NOTICE: drop cascades to table t1c drop table t2; diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..b4d1b9b97a 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -507,6 +507,45 @@ create temp table p_t1_2 partition of p_t1 for values in(2); -- Ensure we can remove non-PK columns for partitioned tables. explain (costs off) select * from p_t1 group by a,b,c,d; +create unique index t3_c_uidx on t3(c); + +-- Ensure we don't remove any columns from the GROUP BY for a unique +-- index on a NULLable column. +explain (costs off) select b,c from t3 group by b,c; + +-- Make the column NOT NULL and ensure we remove the redundant column +alter table t3 alter column c set not null; +explain (costs off) select b,c from t3 group by b,c; + +-- When there are multiple supporting unique indexes and the GROUP BY contains +-- columns to cover all of those, ensure we pick the index with the least +-- number of columns so that we can remove more columns from the GROUP BY. +explain (costs off) select a,b,c from t3 group by a,b,c; + +-- As above but try ordering the columns differently to ensure we get the +-- same result. +explain (costs off) select a,b,c from t3 group by c,a,b; + +-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT +-- NULL constraint on the indexed columns. Ensure the redundant columns are +-- removed from the GROUP BY for such a table. +drop index t3_c_uidx; +alter table t3 alter column c drop not null; +create unique index t3_c_uidx on t3(c) nulls not distinct; +explain (costs off) select b,c from t3 group by b,c; + + +--test unique expression index can be used to remove redundant columns. +drop index t3_c_uidx; +create unique index t3_bc on t3((c+b)); +alter table t3 alter column c set not null; +explain(costs off) select a,b,c from t3 group by a,b,c; + +--test NULLS NOT DISTINCT with unique expression index +create unique index t3_bc_uidx on t3((c+b)) nulls not distinct; +alter table t3 alter column c drop not null; +explain(costs off) select a,b,c from t3 group by a,b,c; + drop table t1 cascade; drop table t2; drop table t3; -- 2.34.1