Minor changes in get_unique_not_null_attnos: * cosmetic changes * if the returned list length is less than 2, there is no need to sort.
From 58094504364d8a9eb51cbc77a75626aef39b17f0 Mon Sep 17 00:00:00 2001 From: jian he <jian.universal...@gmail.com> Date: Fri, 29 Nov 2024 09:23:27 +0800 Subject: [PATCH v7 1/1] remove useless group by columns via unique-not-null
In the `remove_useless_groupby_columns` function, use a unique index and a not-null constraint to eliminate unnecessary columns. Here, the primary key is treated as a "unique index and not-null". If there are multiple candidates, use the unique index with the fewest columns to remove the other columns in groupby expression. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar --- src/backend/catalog/index.c | 149 +++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 53 ++++++-- src/include/catalog/index.h | 1 + src/test/regress/expected/aggregates.out | 59 +++++++- src/test/regress/expected/create_index.out | 11 +- src/test/regress/expected/indexing.out | 11 ++ src/test/regress/sql/aggregates.sql | 26 +++- src/test/regress/sql/create_index.sql | 4 +- src/test/regress/sql/indexing.sql | 3 + src/tools/pgindent/typedefs.list | 1 + 10 files changed, 303 insertions(+), 15 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f9bb721c5f..2555997c3c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -56,6 +56,7 @@ #include "commands/progress.h" #include "commands/tablecmds.h" #include "commands/trigger.h" +#include "common/int.h" #include "executor/executor.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -97,6 +98,14 @@ typedef struct Oid pendingReindexedIndexes[FLEXIBLE_ARRAY_MEMBER]; } SerializedReindexState; +/* Struct for sorting list of Bitmapsets in get_unique_not_null_attnos */ +typedef struct +{ + Bitmapset *unique_notnull_attnos; /* Bitmapset contains attribute number + that are unique and not null*/ + Oid indexoid; /*the unique index oid */ +} unique_nn_info; + /* non-export function prototypes */ static bool relationHasPrimaryKey(Relation rel); static TupleDesc ConstructTupleDescriptor(Relation heapRelation, @@ -131,7 +140,19 @@ static void SetReindexProcessing(Oid heapOid, Oid indexOid); static void ResetReindexProcessing(void); static void 
SetReindexPending(List *indexes); static void RemoveReindexPending(Oid indexOid); +static int unique_notnull_sort_by_idxoid(const void *a, const void *b); +/* + * Comparison function for sorting unique_nn_info in indexoid order. + */ +static int +unique_notnull_sort_by_idxoid(const void *a, const void *b) +{ + const unique_nn_info *unique_a = (const unique_nn_info *) a; + const unique_nn_info *unique_b = (const unique_nn_info *) b; + + return pg_cmp_u32(unique_a->indexoid, unique_b->indexoid); +} /* * relationHasPrimaryKey @@ -4257,3 +4278,131 @@ RestoreReindexState(const void *reindexstate) /* Note the worker has its own transaction nesting level */ reindexingNestLevel = GetCurrentTransactionNestLevel(); } + +/* + * get_unique_not_null_attnos + * Returns a sorted List of Bitmapsets with the attribute numbers (offset + * by FirstLowInvalidHeapAttributeNumber) of all valid non-partial unique + * indexes which contain only non-nullable columns. + */ +List * +get_unique_not_null_attnos(Oid relid) +{ + Bitmapset *not_null_attnos = NULL; + Relation pg_index; + int j = 0; + HeapTuple indexTuple; + SysScanDesc scan; + ScanKeyData skey; + List *not_null_cs; + List *result = NIL; + List *indlist = NIL; + List *indexOidlist = NIL; + unique_nn_info *info = NULL; + ListCell *lc, + *lc2; + + /* Use CookedConstraint to fetch not-null constraint. 
*/ + not_null_cs = RelationGetNotNullConstraints(relid, true, false); + if (not_null_cs == NIL) + return NULL; + + foreach_ptr(CookedConstraint, cc, not_null_cs) + not_null_attnos = bms_add_member(not_null_attnos, cc->attnum - FirstLowInvalidHeapAttributeNumber); + + /* Scan pg_index for unique index of the target rel */ + pg_index = table_open(IndexRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Bitmapset *unique_notnull_attnos = NULL; + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + Datum adatum; + bool isNull; + bool all_notnull = true; + int numkeys; + Datum *keys; + int nelems; + + /* we're only interested in unique index */ + if (!index->indisunique) + continue; + + /* Skip invalid or deferred index */ + if (!index->indisvalid || !index->indimmediate) + continue; + + /* Skip expression index or predicate index */ + if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) || + !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL)) + continue; + + /* Extract the pg_index->indkey int2vector */ + adatum = heap_getattr(indexTuple, Anum_pg_index_indkey, + RelationGetDescr(pg_index), &isNull); + if (isNull) + elog(ERROR, "null int2vector for index %u", index->indexrelid); + + deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID, + &keys, + NULL, + &nelems); + + if (nelems != index->indnatts) + elog(ERROR, "corrupted int2vector for index %u", index->indexrelid); + + Assert(nelems >= index->indnkeyatts); + numkeys = index->indnkeyatts; + + for (int i = 0; i < numkeys; i++) + { + int attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber; + + /* ignore this index if any columns are NULLable */ + if (!bms_is_member(attno, not_null_attnos)) + { + all_notnull = false; + break; + } 
+ + unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno); + } + + /* if the index has no NULLable columns, add it to the list */ + if (all_notnull) + { + indlist = lappend(indlist, unique_notnull_attnos); + indexOidlist = lappend_oid(indexOidlist, index->indexrelid); + } + } + systable_endscan(scan); + + table_close(pg_index, AccessShareLock); + + /* no need sort if we only have less than two candicate, return as is */ + if (list_length(indlist) <= 1) + return indlist; + + info = (unique_nn_info *) palloc0(sizeof(unique_nn_info) * list_length(indlist)); + forboth(lc, indlist, lc2, indexOidlist) + { + Bitmapset *bmt = lfirst_node(Bitmapset, lc); + Oid idxoid = lfirst_oid(lc2); + + info[j].unique_notnull_attnos = bmt; + info[j++].indexoid = idxoid; + } + qsort(info, list_length(indlist), sizeof(unique_nn_info), unique_notnull_sort_by_idxoid); + + for(j = 0; j < list_length(indlist); j++) + result = lappend(result, info[j].unique_notnull_attnos); + + return result; +} diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b665a7762e..9bbe843ec9 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -27,6 +27,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/index.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -2805,8 +2806,9 @@ remove_useless_groupby_columns(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); Bitmapset *relattnos; - Bitmapset *pkattnos; - Oid constraintOid; + Bitmapset *best_attnos = NULL; + List *unique_notnull_attnos; + int32 best_num_attnos = PG_INT32_MAX; relid++; @@ -2828,18 +2830,47 @@ remove_useless_groupby_columns(PlannerInfo *root) continue; /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. 
+ * Fetch details about all suitable unique indexes with NOT NULL + * constraints on all columns. */ - pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); - if (pkattnos == NULL) - continue; + unique_notnull_attnos = get_unique_not_null_attnos(rte->relid); /* - * If the primary key is a proper subset of relattnos then we have - * some items in the GROUP BY that can be removed. + * Now search each index to check if there are any indexes where the + * indexed columns are a subset of the GROUP BY columns for this + * relation. */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + foreach_node(Bitmapset,ind_attnos, unique_notnull_attnos) + { + int members; + + /* + * Skip any indexes where the indexed columns aren't a subset of + * the GROUP BY. + */ + if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1) + continue; + + /* + * Record the attribute numbers from the index with the fewest + * columns. This allows the largest number of columns to be + * removed from the GROUP BY clause. In the future, we may wish + * to consider using the narrowest set of columns and looking at + * pg_statistic.stawidth as it might be better to use an index + * with, say two INT4s, rather than, say, one long varlena column. 
+ */ + members = bms_num_members(ind_attnos); + + if (members < best_num_attnos) + { + best_attnos = ind_attnos; + best_num_attnos = members; + } + } + + list_free(unique_notnull_attnos); + + if (best_attnos != NULL) { /* * To easily remember whether we've found anything to do, we don't @@ -2850,7 +2881,7 @@ remove_useless_groupby_columns(PlannerInfo *root) (list_length(parse->rtable) + 1)); /* Remember the attnos of the removable columns */ - surplusvars[relid] = bms_difference(relattnos, pkattnos); + surplusvars[relid] = bms_difference(relattnos, best_attnos); } } diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..dacc0f6ebd 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern List *get_unique_not_null_attnos(Oid relid); /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..f6380438f6 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1349,7 +1349,7 @@ LINE 1: select avg((select avg(a1.col1 order by (select avg(a2.col2)... 
-- create temp table t1 (a int, b int, c int, d int, primary key (a, b)); create temp table t2 (x int, y int, z int, primary key (x, y)); -create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); +create temp table t3 (a int, b int, c int not null, primary key(a, b) deferrable); -- Non-primary-key columns can be removed from GROUP BY explain (costs off) select * from t1 group by a,b,c,d; QUERY PLAN @@ -1407,6 +1407,26 @@ explain (costs off) select * from t3 group by a,b,c; -> Seq Scan on t3 (3 rows) +-- Cannot optimize when unqiue index is deferrable +ALTER TABLE t3 ADD constraint t3_c_unique unique (c) deferrable initially deferred; +explain (costs off) select count(*) from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +--- not-null constraint cannot be deferrable, so reduce columns ok. +ALTER TABLE t3 ADD constraint t3_b_unique unique (b); +explain (costs off) select * from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b + -> Seq Scan on t3 +(3 rows) + create temp table t1c () inherits (t1); -- Ensure we don't remove any columns when t1 has a child table explain (costs off) select * from t1 group by a,b,c,d; @@ -1454,6 +1474,43 @@ drop table t2; drop table t3; drop table p_t1; -- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t2 +(3 rows) + +drop table t2; +--more than one candicate, choose one will smaller indexoid value. 
+create temp table t3(a int not null, b int not null, c int, + constraint na unique(a), constraint nb unique(b)); +explain(costs off) select count(*) from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a + -> Seq Scan on t3 +(3 rows) + +alter table t3 drop constraint na; +alter table t3 add constraint na unique(a); +explain(costs off) select count(*) from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b + -> Seq Scan on t3 +(3 rows) + +drop table t3; +-- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- create temp table t1(f1 int, f2 int); diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 1b0a5f0e9e..27acc50007 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1526,10 +1526,19 @@ DROP TABLE concur_heap; -- -- Test ADD CONSTRAINT USING INDEX -- -CREATE TABLE cwi_test( a int , b varchar(10), c char); +CREATE TABLE cwi_test( a int not null, b varchar(10) not null, c char); -- add some data so that all tests have something to work with. INSERT INTO cwi_test VALUES(1, 2), (3, 4), (5, 6); CREATE UNIQUE INDEX cwi_uniq_idx ON cwi_test(a , b); +--test unique index with not-null can be used to remove other groupby columns. 
+explain (costs off) select count(*) from cwi_test group by a,b,c; + QUERY PLAN +---------------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on cwi_test +(3 rows) + ALTER TABLE cwi_test ADD primary key USING INDEX cwi_uniq_idx; \d cwi_test Table "public.cwi_test" diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index bcf1db11d7..3f67846480 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -1321,6 +1321,17 @@ insert into idxpart2 (c, a, b) values (42, 572814, 'inserted first'); alter table idxpart2 drop column c; create unique index on idxpart (a); alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +--test using "idxpart_a_idx" to remove useless groupby columns. +explain(costs off) select count(*) from idxpart group by a, b; + QUERY PLAN +-------------------------------------------- + HashAggregate + Group Key: idxpart.a + -> Append + -> Seq Scan on idxpart1 idxpart_1 + -> Seq Scan on idxpart2 idxpart_2 +(5 rows) + insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; ERROR: duplicate key value violates unique constraint "idxpart1_a_idx" diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..27b57a46c7 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -465,7 +465,7 @@ from tenk1 a2(col2); create temp table t1 (a int, b int, c int, d int, primary key (a, b)); create temp table t2 (x int, y int, z int, primary key (x, y)); -create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); +create temp table t3 (a int, b int, c int not null, primary key(a, b) deferrable); -- Non-primary-key columns can be removed from GROUP BY explain (costs off) select * from t1 group by a,b,c,d; @@ -486,6 +486,12 @@ group by 
t1.a,t1.b,t1.c,t1.d,t2.x,t2.z; -- Cannot optimize when PK is deferrable explain (costs off) select * from t3 group by a,b,c; +-- Cannot optimize when unqiue index is deferrable +ALTER TABLE t3 ADD constraint t3_c_unique unique (c) deferrable initially deferred; +explain (costs off) select count(*) from t3 group by b,c; +--- not-null constraint cannot be deferrable, so reduce columns ok. +ALTER TABLE t3 ADD constraint t3_b_unique unique (b); +explain (costs off) select * from t3 group by a,b,c; create temp table t1c () inherits (t1); -- Ensure we don't remove any columns when t1 has a child table @@ -512,6 +518,24 @@ drop table t2; drop table t3; drop table p_t1; +-- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; +drop table t2; + +--more than one candicate, choose one will smaller indexoid value. +create temp table t3(a int not null, b int not null, c int, + constraint na unique(a), constraint nb unique(b)); +explain(costs off) select count(*) from t3 group by a,b,c; +alter table t3 drop constraint na; +alter table t3 add constraint na unique(a); +explain(costs off) select count(*) from t3 group by a,b,c; +drop table t3; + -- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index ddd0d9ad39..7d53e5fab2 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -581,13 +581,15 @@ DROP TABLE concur_heap; -- Test ADD CONSTRAINT USING INDEX -- -CREATE TABLE cwi_test( a int , b varchar(10), c char); +CREATE TABLE cwi_test( a int not null, b varchar(10) not null, c char); -- add some data so that all tests have something to work with. 
INSERT INTO cwi_test VALUES(1, 2), (3, 4), (5, 6); CREATE UNIQUE INDEX cwi_uniq_idx ON cwi_test(a , b); +--test unique index with not-null can be used to remove other groupby columns. +explain (costs off) select count(*) from cwi_test group by a,b,c; ALTER TABLE cwi_test ADD primary key USING INDEX cwi_uniq_idx; \d cwi_test diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index b5cb01c2d7..3d7233feb0 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -705,6 +705,9 @@ insert into idxpart2 (c, a, b) values (42, 572814, 'inserted first'); alter table idxpart2 drop column c; create unique index on idxpart (a); alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +--test using "idxpart_a_idx" to remove useless groupby columns. +explain(costs off) select count(*) from idxpart group by a, b; + insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; insert into idxpart values (16, 'sixteen'); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b54428b38c..9632bbeefe 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4026,6 +4026,7 @@ unicodeStyleColumnFormat unicodeStyleFormat unicodeStyleRowFormat unicode_linestyle +unique_nn_info unit_conversion unlogged_relation_entry utf_local_conversion_func -- 2.34.1