On Fri, Nov 22, 2024 at 9:24 PM jian he <jian.universal...@gmail.com> wrote: > > overall i come up with the attached patch. in v2:
create table t1 (a int, b int not null, c int, unique(b, c)); explain(costs off) select count(*) from t1 group by b,c,a; QUERY PLAN ---------------------- HashAggregate Group Key: b -> Seq Scan on t1 so the v2 implementation is wrong. I overlooked cases like: only part of the unique-key is not-null. In that situation, we cannot remove useless groupby columns. v3 attached. in v3: QUERY PLAN ---------------------- HashAggregate Group Key: b, c, a -> Seq Scan on t1
From 85a64d350f0282c4bfb79a0f70d57eca29a78308 Mon Sep 17 00:00:00 2001 From: jian he <jian.universal...@gmail.com> Date: Wed, 27 Nov 2024 14:45:03 +0800 Subject: [PATCH v3 1/1] remove useless group by columns via unique-not-null in remove_useless_groupby_columns, if primary key exists and is a subset of group by columns. then we remove non-primary-key column in group-by clause. if primary key is not available, then we try to use columns that are unique and not-null to remove other useless columns. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark --- src/backend/catalog/index.c | 94 +++++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 74 +++++++++++++++++- src/include/catalog/index.h | 1 + src/test/regress/expected/aggregates.out | 96 ++++++++++++++++++++++++ src/test/regress/sql/aggregates.sql | 47 ++++++++++++ 5 files changed, 309 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f9bb721c5f..cd5b1ff3e6 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -4257,3 +4257,97 @@ RestoreReindexState(const void *reindexstate) /* Note the worker has its own transaction nesting level */ reindexingNestLevel = GetCurrentTransactionNestLevel(); } + +/* + * given a input relation oid, return a list of attnum Bitmapset(soure: + * pg_index.indkey) that have unique index *and* not-null constraint associate + * with it. note we do not check this relation's primary key. +*/ +List * +get_unique_not_null_attnos(Oid relid) +{ + List *not_null_attnos = NIL; + Relation pg_index; + HeapTuple indexTuple; + SysScanDesc scan; + ScanKeyData skey; + List *not_null_cs; + List *tlist = NIL; + + /* Use CookedConstraint to fetch not-null constraint. */ + not_null_cs = RelationGetNotNullConstraints(relid, true, false); + if (not_null_cs == NIL) + return NULL; + + foreach_ptr(CookedConstraint, cc, not_null_cs) + not_null_attnos = lappend_int(not_null_attnos, cc->attnum); + + /* Scan pg_index for unique index of the target rel */ + pg_index = table_open(IndexRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Bitmapset *unique_notnull_attnos = NULL; + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + Datum adatum; + bool isNull; + bool all_notnull = true; + int numkeys; + Datum *keys; + int nKeys; + + /* we're only interested in unique index */ + if (!index->indisunique || index->indisprimary) + continue; + + /* Skip invalid, exclusion index or deferred index */ + if (!index->indisvalid || index->indisexclusion || !index->indimmediate) + continue; + + /* Skip expression index or predicate index */ + if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) || + !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL)) + continue; + + /* Extract the pg_index->indkey int2vector */ + adatum = heap_getattr(indexTuple, Anum_pg_index_indkey, + RelationGetDescr(pg_index), &isNull); + if (isNull) + elog(ERROR, "null int2vector for index %u", index->indexrelid); + + deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID, + &keys, NULL, &nKeys); + + if(nKeys != index->indnatts) + elog(ERROR, "corrupted int2vector for index %u", index->indexrelid); + + Assert(nKeys >= index->indnkeyatts); + numkeys = index->indnkeyatts; + + for (int i = 0; i < numkeys; i++) + { + /* Skip if any of unique key's attnum don't have not-null */ + if (!list_member_int(not_null_attnos, DatumGetInt16(keys[i]))) + { + all_notnull = false; + break; + } + + unique_notnull_attnos = bms_add_member(unique_notnull_attnos, + DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber); + } + if (all_notnull) + tlist = lappend(tlist, unique_notnull_attnos); + } + systable_endscan(scan); + + table_close(pg_index, AccessShareLock); + return tlist; +} \ No newline at end of file diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b665a7762e..39d92b2b53 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -27,6 +27,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/index.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -2807,6 +2808,9 @@ remove_useless_groupby_columns(PlannerInfo *root) Bitmapset *relattnos; Bitmapset *pkattnos; Oid constraintOid; + Bitmapset *attnums = NULL; + List *unique_notnull_attnos = NIL; + List *matched = NIL; relid++; @@ -2828,18 +2832,69 @@ remove_useless_groupby_columns(PlannerInfo *root) continue; /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. + * Can't remove any columns for this rel if there is no suitable (i.e., + * nondeferrable) primary key constraint. However if there is columns + * both have unique-index and not-null constraint, we can also remove + * some columns. If primary key is there, we won't looking for + * unique-not-null constraints. */ pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); if (pkattnos == NULL) + { + unique_notnull_attnos = get_unique_not_null_attnos(rte->relid); + if (list_length(unique_notnull_attnos) > 0) + { + int out_num, + inner_num; + + foreach_node(Bitmapset, attnos, unique_notnull_attnos) + { + Assert(attnos != NULL); + + if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1) + continue; + else + matched = lappend(matched, attnos); + } + /* + * There may be multiple unique-index-not-null attnums that are + * subsets of groupbyattnos. We select the one with the fewest + * members. If there are several with the same number of + * members , we choose the first one. We avoid using `foreach` + * here because we delete items from the list. + */ + for (int outerpos = 0; outerpos < list_length(matched); outerpos++) + { + Bitmapset *bt; + bt = list_nth_node(Bitmapset, matched, outerpos); + + out_num = bms_num_members(bt); + for (int restpos = outerpos + 1; restpos < list_length(matched);) + { + Bitmapset *other; + other = list_nth_node(Bitmapset, matched, restpos); + + inner_num = bms_num_members(other); + if (inner_num > out_num) + matched = list_delete_nth_cell(matched, restpos); + else if (inner_num < out_num) + matched = list_delete_nth_cell(matched, outerpos); + restpos++; + } + } + /* If there is at least one candidate, select the first one. */ + if (list_length(matched) > 0) + attnums = list_nth_node(Bitmapset, matched, 0); + } + } + if ((pkattnos == NULL && attnums == NULL)) continue; /* * If the primary key is a proper subset of relattnos then we have * some items in the GROUP BY that can be removed. */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + if (pkattnos != NULL && bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) { /* * To easily remember whether we've found anything to do, we don't @@ -2852,6 +2907,19 @@ remove_useless_groupby_columns(PlannerInfo *root) /* Remember the attnos of the removable columns */ surplusvars[relid] = bms_difference(relattnos, pkattnos); } + /* + * If attnums is not NULL then this attnums (unique-index-not-null) is a + * proper subset of relattnos. see above. we can remove some items in + * GROUOP BY + */ + else if(attnums != NULL) + { + if (surplusvars == NULL) + surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + 1)); + + surplusvars[relid] = bms_difference(relattnos, attnums); + } } /* diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..dacc0f6ebd 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern List *get_unique_not_null_attnos(Oid relid); /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..563b5c1c0d 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1454,6 +1454,102 @@ drop table t2; drop table t3; drop table p_t1; -- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10); +ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; +create temp table t6 (a int, b int not null, c int, d int, unique(b, c)); +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b, c + -> Seq Scan on t1 +(3 rows) + +-- both unique not null index and primary key is there, +-- using primary key to remove useless group by columns +explain (costs off) select * from t2 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t2 +(3 rows) + +-- Test primary key beats unique not null index. +explain (costs off) select * from t3 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t3 +(3 rows) + +-- Test unique not null indices have overlap. +explain (costs off) select * from t4 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t4 +(3 rows) + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c, b + -> Seq Scan on t4 +(3 rows) + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + QUERY PLAN +----------------------------------- + HashAggregate + Group Key: t5.a, t5.b + -> Append + -> Seq Scan on t5_0 t5_1 + -> Seq Scan on t5_1 t5_2 +(5 rows) + +--can reduce to a,b, groupby order or having duplicates does not matter +explain (costs off) select count(*) from t5 group by b,a,a, d, c; + QUERY PLAN +----------------------------------- + HashAggregate + Group Key: t5.b, t5.a + -> Append + -> Seq Scan on t5_0 t5_1 + -> Seq Scan on t5_1 t5_2 +(5 rows) + +---only part of unique key columns is not-null, cannot use it to remove columns. +explain(costs off) select count(*) from t6 group by b,c,a,d; + QUERY PLAN +------------------------- + HashAggregate + Group Key: b, c, a, d + -> Seq Scan on t6 +(3 rows) + +drop table t1, t2,t3,t4,t5,t6; +-- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- create temp table t1(f1 int, f2 int); diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..66f4f45953 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -512,6 +512,53 @@ drop table t2; drop table t3; drop table p_t1; +-- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); + +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10); +ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; + +create temp table t6 (a int, b int not null, c int, d int, unique(b, c)); + +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + +-- both unique not null index and primary key is there, +-- using primary key to remove useless group by columns +explain (costs off) select * from t2 group by a,b,c; + +-- Test primary key beats unique not null index. +explain (costs off) select * from t3 group by a,b,c,d; + +-- Test unique not null indices have overlap. +explain (costs off) select * from t4 group by a,b,c,d; + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + +--can reduce to a,b, groupby order or having duplicates does not matter +explain (costs off) select count(*) from t5 group by b,a,a, d, c; + +---only part of unique key columns is not-null, cannot use it to remove columns. +explain(costs off) select count(*) from t6 group by b,c,a,d; + +drop table t1, t2,t3,t4,t5,t6; -- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- -- 2.34.1