On Wed, Nov 27, 2024 at 5:30 PM David Rowley <dgrowle...@gmail.com> wrote:
>
> On Wed, 27 Nov 2024 at 19:51, jian he <jian.universal...@gmail.com> wrote:
> > v3 attached. in v3:
>
> 1. I find the following quite strange:
>
> postgres=# create table abcd (a int primary key, b int not null, c int
> not null, d int not null, unique(b));
> CREATE TABLE
> postgres=# explain select b,c from abcd group by b,c;
>                           QUERY PLAN
> --------------------------------------------------------------
>  HashAggregate  (cost=37.75..56.25 rows=1850 width=8)
>    Group Key: b, c
>    ->  Seq Scan on abcd  (cost=0.00..28.50 rows=1850 width=8)
> (3 rows)
>
>
> postgres=# alter table abcd drop constraint abcd_pkey;
> ALTER TABLE
> postgres=# explain select b,c from abcd group by b,c;
>                           QUERY PLAN
> --------------------------------------------------------------
>  HashAggregate  (cost=33.12..51.62 rows=1850 width=8)
>    Group Key: b
>    ->  Seq Scan on abcd  (cost=0.00..28.50 rows=1850 width=8)
> (3 rows)
>
> Why does the patch only try to remove GROUP BY columns using unique
> indexes when there's no primary key?
>
Fixed. Also added tests for it.
> I don't really see why primary key should be treated specially. Why
> does the patch not just unify the logic and collect up all unique
> non-expression index, non-partial indexes where all columns are NOT
> NULL. You could maybe just add a special case to skip the NOT NULL
> checking for indexes with indisprimary set.
>
There is now no need to scan pg_constraint; a single scan of pg_index
collects the info. I also removed the call to get_primary_key_attnos.

> 2. In get_unique_not_null_attnos(), not_null_attnos could be a
> Bitmapset with attnums offset by FirstLowInvalidHeapAttributeNumber.
> That'll allow you to do a bms_is_member() call rather than a (possibly
> expensive) list_member_int() call.

Fixed.

> 3. The logic in remove_useless_groupby_columns() looks much more
> complex than it needs to be. It would be much more simple to find the
> matching unique index with the least number of columns and use that
> one. Just have a counter to record the bms_num_members() of the
> columns of the best match so far and replace it when you find an index
> with fewer columns. Please see adjust_group_pathkeys_for_groupagg()
> for inspiration.
>
get_unique_not_null_attnos is now:

    List *
    get_unique_not_null_attnos(Oid relid, int *pk_pos)

The returned pk_pos is the index (zero-based) of the primary key's
attnums within the returned list. pk_pos is currently unused, but I
expect it will be useful, since it lets us quickly retrieve the
primary key's attnums.

> We also need to think about if the unique index with the least columns
> is always the best one to use. Perhaps the index with the least
> columns is a varlena column and there's some index with, say, two
> INT4s. It would be much cheaper to hash a couple of INT4s than some
> long varlena column. Maybe it's ok just to leave an XXX comment
> explaining that we might want to think about doing that one day.
> Alternatively, summing up the pg_statistic.stawidth values could work.
> Likely it's better to make the patch work correctly before thinking
> about that part.
This part has not been addressed yet. The v4 logic is to choose the
candidate with the least number of columns; if there is more than one
with the least number of columns, it simply chooses the first one in
the matched list.

I think I have addressed most of your concerns; the code now seems
pretty neat.
From d585cf0193a700501a153f75f653aa953e8a9d02 Mon Sep 17 00:00:00 2001
From: jian he <jian.universal...@gmail.com>
Date: Thu, 28 Nov 2024 12:37:22 +0800
Subject: [PATCH v4 1/1] remove useless group by columns via unique-not-null

In the `remove_useless_groupby_columns` function, use a unique index
and a not-null constraint to eliminate unnecessary columns.  Here, the
primary key is treated as a "unique index and not-null".  If there are
multiple candidates, use the unique index with the fewest columns to
remove the other columns in the GROUP BY expression.

discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar
---
 src/backend/catalog/index.c              | 101 +++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  66 +++++++++-----
 src/include/catalog/index.h              |   1 +
 src/test/regress/expected/aggregates.out | 106 +++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql      |  51 +++++++++++
 5 files changed, 304 insertions(+), 21 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..841522ab4a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,104 @@ RestoreReindexState(const void *reindexstate)
 	/* Note the worker has its own transaction nesting level */
 	reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
+
+/*
+ * Given an input relation oid, return a list of Bitmapsets (source:
+ * pg_index.indkey) for indexes that are unique *and* have a not-null
+ * constraint on every key column.  Note this includes the primary key.
+ * The returned pk_pos is the primary key attnums' index within the
+ * returned List.
+ */
+List *
+get_unique_not_null_attnos(Oid relid, int *pk_pos)
+{
+	Bitmapset  *not_null_attnos = NULL;
+	Relation	pg_index;
+	HeapTuple	indexTuple;
+	SysScanDesc scan;
+	ScanKeyData skey;
+	List	   *not_null_cs;
+	List	   *tlist = NIL;
+	int			attno;
+	int			pos = 0;
+
+	/* Use CookedConstraint to fetch the not-null constraints. */
+	not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+	if (not_null_cs == NIL)
+		return NIL;
+
+	foreach_ptr(CookedConstraint, cc, not_null_cs)
+		not_null_attnos = bms_add_member(not_null_attnos,
+										 cc->attnum - FirstLowInvalidHeapAttributeNumber);
+
+	/* Scan pg_index for unique indexes of the target rel */
+	pg_index = table_open(IndexRelationId, AccessShareLock);
+
+	ScanKeyInit(&skey,
+				Anum_pg_index_indrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+							  NULL, 1, &skey);
+
+	while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+	{
+		Bitmapset  *unique_notnull_attnos = NULL;
+		Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Datum		adatum;
+		bool		isNull;
+		bool		all_notnull = true;
+		int			numkeys;
+		Datum	   *keys;
+		int			nKeys;
+
+		/* we're only interested in unique indexes */
+		if (!index->indisunique)
+			continue;
+
+		/* Skip invalid or deferred indexes */
+		if (!index->indisvalid || !index->indimmediate)
+			continue;
+
+		/* Skip expression indexes and predicate (partial) indexes */
+		if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+			!heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+			continue;
+
+		/* Extract the pg_index->indkey int2vector */
+		adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+							  RelationGetDescr(pg_index), &isNull);
+		if (isNull)
+			elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+		deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+								  &keys, NULL, &nKeys);
+
+		if (nKeys != index->indnatts)
+			elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+
+		Assert(nKeys >= index->indnkeyatts);
+		numkeys = index->indnkeyatts;
+
+		for (int i = 0; i < numkeys; i++)
+		{
+			/* Skip if any of the unique key's attnums lacks a not-null */
+			attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber;
+			if (!bms_is_member(attno, not_null_attnos))
+			{
+				all_notnull = false;
+				break;
+			}
+			unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno);
+		}
+		if (all_notnull)
+		{
+			if (index->indisprimary)
+				*pk_pos = pos;
+			pos++;
+			tlist = lappend(tlist, unique_notnull_attnos);
+		}
+	}
+	systable_endscan(scan);
+
+	table_close(pg_index, AccessShareLock);
+	return tlist;
+}
\ No newline at end of file
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762e..2990e0412f 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "catalog/index.h"
 #include "executor/executor.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -2805,8 +2806,10 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	{
 		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
 		Bitmapset  *relattnos;
-		Bitmapset  *pkattnos;
-		Oid			constraintOid;
+		List	   *unique_notnull_attnos = NIL;
+		List	   *matched = NIL;
+		Bitmapset  *best_attnos = NULL;
+		int			pk_pos = -1;
 
 		relid++;
 
@@ -2828,30 +2831,51 @@
 			continue;
 
 		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
+		 * Can't remove any columns for this rel if there is no suitable (i.e.,
+		 * nondeferrable) primary key constraint.  We can remove some columns
+		 * if the attnums of at least one unique index whose columns are all
+		 * not-null form a subset of the GROUP BY columns.  Here, the primary
+		 * key is treated as a unique index with not-null columns.
 		 */
-		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
-		if (pkattnos == NULL)
-			continue;
+		unique_notnull_attnos = get_unique_not_null_attnos(rte->relid, &pk_pos);
 
-		/*
-		 * If the primary key is a proper subset of relattnos then we have
-		 * some items in the GROUP BY that can be removed.
-		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		if (list_length(unique_notnull_attnos) > 0)
 		{
-			/*
-			 * To easily remember whether we've found anything to do, we don't
-			 * allocate the surplusvars[] array until we find something.
-			 */
-			if (surplusvars == NULL)
-				surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
-													 (list_length(parse->rtable) + 1));
+			foreach_node(Bitmapset, attnos, unique_notnull_attnos)
+			{
+				Assert(attnos != NULL);
+				if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1)
+					continue;
+				else
+					matched = lappend(matched, attnos);
+			}
 
-			/* Remember the attnos of the removable columns */
-			surplusvars[relid] = bms_difference(relattnos, pkattnos);
+			if (list_length(matched) == 0)
+				continue;
+
+			best_attnos = list_nth_node(Bitmapset, matched, 0);
+			for (int i = 0; i < list_length(matched); i++)
+			{
+				Bitmapset  *bt = list_nth_node(Bitmapset, matched, i);
+
+				if (bms_num_members(bt) < bms_num_members(best_attnos))
+					best_attnos = bt;
+			}
 		}
+		else
+			continue;
+
+		Assert(best_attnos);
+
+		/*
+		 * To easily remember whether we've found anything to do, we don't
+		 * allocate the surplusvars[] array until we find something.
+		 */
+		if (surplusvars == NULL)
+			surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+												 (list_length(parse->rtable) + 1));
+
+		/* Remember the attnos of the removable columns */
+		surplusvars[relid] = bms_difference(relattnos, best_attnos);
 	}
 
 	/*
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..b171102c76 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
 
 extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
 
+extern List *get_unique_not_null_attnos(Oid relid, int *pk_pos);
 
 /*
  * itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..c60615cfbd 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,112 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 --
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+                 c int not null, d int not null,
+                 unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t1
+(3 rows)
+
+-- using unique index remove useless group by columns
+-- since it has less columns compare with primary key
+explain (costs off) select * from t2 group by a,b,c;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+--can remove column b, since c has unique index and is not-null
+explain (costs off) select count(*) from t2 group by b,c;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+--more than one candidate can be used to remove other column.
+--optimizer will use primary key or unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+      QUERY PLAN
+----------------------
+ HashAggregate
+   Group Key: c, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+            QUERY PLAN
+-----------------------------------
+ HashAggregate
+   Group Key: t5.a, t5.b
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+            QUERY PLAN
+-----------------------------------
+ HashAggregate
+   Group Key: t5.b, t5.a
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+       QUERY PLAN
+-------------------------
+ HashAggregate
+   Group Key: b, c, a, d
+   ->  Seq Scan on t6
+(3 rows)
+
+drop table t1, t2,t3,t4,t5,t6;
+--
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
 create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..b4b16b29a8 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,57 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 
+--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+                 c int not null, d int not null,
+                 unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- using unique index remove useless group by columns
+-- since it has less columns compare with primary key
+explain (costs off) select * from t2 group by a,b,c;
+
+--can remove column b, since c has unique index and is not-null
+explain (costs off) select count(*) from t2 group by b,c;
+
+--more than one candidate can be used to remove other column.
+--optimizer will use primary key or unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+
+drop table t1, t2,t3,t4,t5,t6;
 --
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
-- 
2.34.1