Minor changes in get_unique_not_null_attnos: * cosmetic changes * if the returned list length is less than 2, there is no need to sort.
From 58094504364d8a9eb51cbc77a75626aef39b17f0 Mon Sep 17 00:00:00 2001 From: jian he <jian.universal...@gmail.com> Date: Fri, 29 Nov 2024 09:23:27 +0800 Subject: [PATCH v7 1/1] remove useless group by columns via unique-not-null
In the `remove_useless_groupby_columns` function, use a unique index and a not-null constraint to eliminate unnecessary columns. Here, the primary key is treated as a "unique index and not-null". If there are multiple candidates, use the unique index with the fewest columns to remove the other columns in groupby expression. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar --- src/backend/catalog/index.c | 149 +++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 53 ++++++-- src/include/catalog/index.h | 1 + src/test/regress/expected/aggregates.out | 59 +++++++- src/test/regress/expected/create_index.out | 11 +- src/test/regress/expected/indexing.out | 11 ++ src/test/regress/sql/aggregates.sql | 26 +++- src/test/regress/sql/create_index.sql | 4 +- src/test/regress/sql/indexing.sql | 3 + src/tools/pgindent/typedefs.list | 1 + 10 files changed, 303 insertions(+), 15 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f9bb721c5f..2555997c3c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -56,6 +56,7 @@ #include "commands/progress.h" #include "commands/tablecmds.h" #include "commands/trigger.h" +#include "common/int.h" #include "executor/executor.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -97,6 +98,14 @@ typedef struct Oid pendingReindexedIndexes[FLEXIBLE_ARRAY_MEMBER]; } SerializedReindexState; +/* Struct for sorting list of Bitmapsets in get_unique_not_null_attnos */ +typedef struct +{ + Bitmapset *unique_notnull_attnos; /* Bitmapset contains attribute number + that are unique and not null*/ + Oid indexoid; /*the unique index oid */ +} unique_nn_info; + /* non-export function prototypes */ static bool relationHasPrimaryKey(Relation rel); static TupleDesc ConstructTupleDescriptor(Relation heapRelation, @@ -131,7 +140,19 @@ static void SetReindexProcessing(Oid heapOid, Oid indexOid); static void ResetReindexProcessing(void); static void 
SetReindexPending(List *indexes); static void RemoveReindexPending(Oid indexOid); +static int unique_notnull_sort_by_idxoid(const void *a, const void *b); +/* + * Comparison function for sorting unique_nn_info in indexoid order. + */ +static int +unique_notnull_sort_by_idxoid(const void *a, const void *b) +{ + const unique_nn_info *unique_a = (const unique_nn_info *) a; + const unique_nn_info *unique_b = (const unique_nn_info *) b; + + return pg_cmp_u32(unique_a->indexoid, unique_b->indexoid); +} /* * relationHasPrimaryKey @@ -4257,3 +4278,131 @@ RestoreReindexState(const void *reindexstate) /* Note the worker has its own transaction nesting level */ reindexingNestLevel = GetCurrentTransactionNestLevel(); } + +/* + * get_unique_not_null_attnos + * Returns a sorted List of Bitmapsets with the attribute numbers (offset + * by FirstLowInvalidHeapAttributeNumber) of all valid non-partial unique + * indexes which contain only non-nullable columns. + */ +List * +get_unique_not_null_attnos(Oid relid) +{ + Bitmapset *not_null_attnos = NULL; + Relation pg_index; + int j = 0; + HeapTuple indexTuple; + SysScanDesc scan; + ScanKeyData skey; + List *not_null_cs; + List *result = NIL; + List *indlist = NIL; + List *indexOidlist = NIL; + unique_nn_info *info = NULL; + ListCell *lc, + *lc2; + + /* Use CookedConstraint to fetch not-null constraint. 
*/ + not_null_cs = RelationGetNotNullConstraints(relid, true, false); + if (not_null_cs == NIL) + return NULL; + + foreach_ptr(CookedConstraint, cc, not_null_cs) + not_null_attnos = bms_add_member(not_null_attnos, cc->attnum - FirstLowInvalidHeapAttributeNumber); + + /* Scan pg_index for unique index of the target rel */ + pg_index = table_open(IndexRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Bitmapset *unique_notnull_attnos = NULL; + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + Datum adatum; + bool isNull; + bool all_notnull = true; + int numkeys; + Datum *keys; + int nelems; + + /* we're only interested in unique index */ + if (!index->indisunique) + continue; + + /* Skip invalid or deferred index */ + if (!index->indisvalid || !index->indimmediate) + continue; + + /* Skip expression index or predicate index */ + if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) || + !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL)) + continue; + + /* Extract the pg_index->indkey int2vector */ + adatum = heap_getattr(indexTuple, Anum_pg_index_indkey, + RelationGetDescr(pg_index), &isNull); + if (isNull) + elog(ERROR, "null int2vector for index %u", index->indexrelid); + + deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID, + &keys, + NULL, + &nelems); + + if (nelems != index->indnatts) + elog(ERROR, "corrupted int2vector for index %u", index->indexrelid); + + Assert(nelems >= index->indnkeyatts); + numkeys = index->indnkeyatts; + + for (int i = 0; i < numkeys; i++) + { + int attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber; + + /* ignore this index if any columns are NULLable */ + if (!bms_is_member(attno, not_null_attnos)) + { + all_notnull = false; + break; + } 
+ + unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno); + } + + /* if the index has no NULLable columns, add it to the list */ + if (all_notnull) + { + indlist = lappend(indlist, unique_notnull_attnos); + indexOidlist = lappend_oid(indexOidlist, index->indexrelid); + } + } + systable_endscan(scan); + + table_close(pg_index, AccessShareLock); + + /* no need sort if we only have less than two candicate, return as is */ + if (list_length(indlist) <= 1) + return indlist; + + info = (unique_nn_info *) palloc0(sizeof(unique_nn_info) * list_length(indlist)); + forboth(lc, indlist, lc2, indexOidlist) + { + Bitmapset *bmt = lfirst_node(Bitmapset, lc); + Oid idxoid = lfirst_oid(lc2); + + info[j].unique_notnull_attnos = bmt; + info[j++].indexoid = idxoid; + } + qsort(info, list_length(indlist), sizeof(unique_nn_info), unique_notnull_sort_by_idxoid); + + for(j = 0; j < list_length(indlist); j++) + result = lappend(result, info[j].unique_notnull_attnos); + + return result; +} diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b665a7762e..9bbe843ec9 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -27,6 +27,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/index.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -2805,8 +2806,9 @@ remove_useless_groupby_columns(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); Bitmapset *relattnos; - Bitmapset *pkattnos; - Oid constraintOid; + Bitmapset *best_attnos = NULL; + List *unique_notnull_attnos; + int32 best_num_attnos = PG_INT32_MAX; relid++; @@ -2828,18 +2830,47 @@ remove_useless_groupby_columns(PlannerInfo *root) continue; /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. 
+ * Fetch details about all suitable unique indexes with NOT NULL + * constraints on all columns. */ - pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); - if (pkattnos == NULL) - continue; + unique_notnull_attnos = get_unique_not_null_attnos(rte->relid); /* - * If the primary key is a proper subset of relattnos then we have - * some items in the GROUP BY that can be removed. + * Now search each index to check if there are any indexes where the + * indexed columns are a subset of the GROUP BY columns for this + * relation. */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + foreach_node(Bitmapset,ind_attnos, unique_notnull_attnos) + { + int members; + + /* + * Skip any indexes where the indexed columns aren't a subset of + * the GROUP BY. + */ + if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1) + continue; + + /* + * Record the attribute numbers from the index with the fewest + * columns. This allows the largest number of columns to be + * removed from the GROUP BY clause. In the future, we may wish + * to consider using the narrowest set of columns and looking at + * pg_statistic.stawidth as it might be better to use an index + * with, say two INT4s, rather than, say, one long varlena column. 
+ */ + members = bms_num_members(ind_attnos); + + if (members < best_num_attnos) + { + best_attnos = ind_attnos; + best_num_attnos = members; + } + } + + list_free(unique_notnull_attnos); + + if (best_attnos != NULL) { /* * To easily remember whether we've found anything to do, we don't @@ -2850,7 +2881,7 @@ remove_useless_groupby_columns(PlannerInfo *root) (list_length(parse->rtable) + 1)); /* Remember the attnos of the removable columns */ - surplusvars[relid] = bms_difference(relattnos, pkattnos); + surplusvars[relid] = bms_difference(relattnos, best_attnos); } } diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..dacc0f6ebd 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern List *get_unique_not_null_attnos(Oid relid); /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..f6380438f6 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1349,7 +1349,7 @@ LINE 1: select avg((select avg(a1.col1 order by (select avg(a2.col2)... 
-- create temp table t1 (a int, b int, c int, d int, primary key (a, b)); create temp table t2 (x int, y int, z int, primary key (x, y)); -create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); +create temp table t3 (a int, b int, c int not null, primary key(a, b) deferrable); -- Non-primary-key columns can be removed from GROUP BY explain (costs off) select * from t1 group by a,b,c,d; QUERY PLAN @@ -1407,6 +1407,26 @@ explain (costs off) select * from t3 group by a,b,c; -> Seq Scan on t3 (3 rows) +-- Cannot optimize when unqiue index is deferrable +ALTER TABLE t3 ADD constraint t3_c_unique unique (c) deferrable initially deferred; +explain (costs off) select count(*) from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +--- not-null constraint cannot be deferrable, so reduce columns ok. +ALTER TABLE t3 ADD constraint t3_b_unique unique (b); +explain (costs off) select * from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b + -> Seq Scan on t3 +(3 rows) + create temp table t1c () inherits (t1); -- Ensure we don't remove any columns when t1 has a child table explain (costs off) select * from t1 group by a,b,c,d; @@ -1454,6 +1474,43 @@ drop table t2; drop table t3; drop table p_t1; -- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t2 +(3 rows) + +drop table t2; +--more than one candicate, choose one will smaller indexoid value. 
+create temp table t3(a int not null, b int not null, c int, + constraint na unique(a), constraint nb unique(b)); +explain(costs off) select count(*) from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a + -> Seq Scan on t3 +(3 rows) + +alter table t3 drop constraint na; +alter table t3 add constraint na unique(a); +explain(costs off) select count(*) from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b + -> Seq Scan on t3 +(3 rows) + +drop table t3; +-- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- create temp table t1(f1 int, f2 int); diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 1b0a5f0e9e..27acc50007 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1526,10 +1526,19 @@ DROP TABLE concur_heap; -- -- Test ADD CONSTRAINT USING INDEX -- -CREATE TABLE cwi_test( a int , b varchar(10), c char); +CREATE TABLE cwi_test( a int not null, b varchar(10) not null, c char); -- add some data so that all tests have something to work with. INSERT INTO cwi_test VALUES(1, 2), (3, 4), (5, 6); CREATE UNIQUE INDEX cwi_uniq_idx ON cwi_test(a , b); +--test unique index with not-null can be used to remove other groupby columns. 
+explain (costs off) select count(*) from cwi_test group by a,b,c; + QUERY PLAN +---------------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on cwi_test +(3 rows) + ALTER TABLE cwi_test ADD primary key USING INDEX cwi_uniq_idx; \d cwi_test Table "public.cwi_test" diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index bcf1db11d7..3f67846480 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -1321,6 +1321,17 @@ insert into idxpart2 (c, a, b) values (42, 572814, 'inserted first'); alter table idxpart2 drop column c; create unique index on idxpart (a); alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +--test using "idxpart_a_idx" to remove useless groupby columns. +explain(costs off) select count(*) from idxpart group by a, b; + QUERY PLAN +-------------------------------------------- + HashAggregate + Group Key: idxpart.a + -> Append + -> Seq Scan on idxpart1 idxpart_1 + -> Seq Scan on idxpart2 idxpart_2 +(5 rows) + insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; ERROR: duplicate key value violates unique constraint "idxpart1_a_idx" diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..27b57a46c7 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -465,7 +465,7 @@ from tenk1 a2(col2); create temp table t1 (a int, b int, c int, d int, primary key (a, b)); create temp table t2 (x int, y int, z int, primary key (x, y)); -create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); +create temp table t3 (a int, b int, c int not null, primary key(a, b) deferrable); -- Non-primary-key columns can be removed from GROUP BY explain (costs off) select * from t1 group by a,b,c,d; @@ -486,6 +486,12 @@ group by 
t1.a,t1.b,t1.c,t1.d,t2.x,t2.z; -- Cannot optimize when PK is deferrable explain (costs off) select * from t3 group by a,b,c; +-- Cannot optimize when unqiue index is deferrable +ALTER TABLE t3 ADD constraint t3_c_unique unique (c) deferrable initially deferred; +explain (costs off) select count(*) from t3 group by b,c; +--- not-null constraint cannot be deferrable, so reduce columns ok. +ALTER TABLE t3 ADD constraint t3_b_unique unique (b); +explain (costs off) select * from t3 group by a,b,c; create temp table t1c () inherits (t1); -- Ensure we don't remove any columns when t1 has a child table @@ -512,6 +518,24 @@ drop table t2; drop table t3; drop table p_t1; +-- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; +drop table t2; + +--more than one candicate, choose one will smaller indexoid value. +create temp table t3(a int not null, b int not null, c int, + constraint na unique(a), constraint nb unique(b)); +explain(costs off) select count(*) from t3 group by a,b,c; +alter table t3 drop constraint na; +alter table t3 add constraint na unique(a); +explain(costs off) select count(*) from t3 group by a,b,c; +drop table t3; + -- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index ddd0d9ad39..7d53e5fab2 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -581,13 +581,15 @@ DROP TABLE concur_heap; -- Test ADD CONSTRAINT USING INDEX -- -CREATE TABLE cwi_test( a int , b varchar(10), c char); +CREATE TABLE cwi_test( a int not null, b varchar(10) not null, c char); -- add some data so that all tests have something to work with. 
INSERT INTO cwi_test VALUES(1, 2), (3, 4), (5, 6); CREATE UNIQUE INDEX cwi_uniq_idx ON cwi_test(a , b); +--test unique index with not-null can be used to remove other groupby columns. +explain (costs off) select count(*) from cwi_test group by a,b,c; ALTER TABLE cwi_test ADD primary key USING INDEX cwi_uniq_idx; \d cwi_test diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index b5cb01c2d7..3d7233feb0 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -705,6 +705,9 @@ insert into idxpart2 (c, a, b) values (42, 572814, 'inserted first'); alter table idxpart2 drop column c; create unique index on idxpart (a); alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +--test using "idxpart_a_idx" to remove useless groupby columns. +explain(costs off) select count(*) from idxpart group by a, b; + insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; insert into idxpart values (16, 'sixteen'); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b54428b38c..9632bbeefe 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4026,6 +4026,7 @@ unicodeStyleColumnFormat unicodeStyleFormat unicodeStyleRowFormat unicode_linestyle +unique_nn_info unit_conversion unlogged_relation_entry utf_local_conversion_func -- 2.34.1