On Thu, Sep 12, 2024 at 9:44 AM David Rowley <dgrowle...@gmail.com> wrote: > > On Sat, 30 Dec 2023 at 04:05, Zhang Mingli <zmlpostg...@gmail.com> wrote: > > So my patch make it easy: check unique index’s columns, it’s a valid > > candidate if all of that have NOT NULL constraint. > > And we choose a best one who has the least column numbers in > > get_min_unique_not_null_attnos(), as the reason: less columns mean that > > more group by columns could be removed. > > This patch no longer applies. We no longer catalogue NOT NULL > constraints, which this patch is coded to rely upon. (Likely it could > just look at pg_attribute.attnotnull instead) > > Aside from that, I'm not sure about the logic to prefer removing > functionally dependent columns using the constraint with the least > columns. If you had the PRIMARY KEY on two int columns and a unique > index on a 1MB varlena column, I think using the primary key would be > a better choice to remove functionally dependent columns of. Maybe > it's ok to remove any functionally dependant columns on the primary > key first and try removing functionally dependent columns on unique > constraints and a 2nd step (or maybe only if none can be removed using > the PK?) >
Let's be conservative: if the primary key is there, then use it to remove other useless columns; if it is not there, find the unique-index-not-null columns and use them to remove other columns in the GROUP BY clause. > Also, why constraints rather than unique indexes? You'd need to check > for expression indexes and likely ignore those due to no ability to > detect NOT NULL for expressions. > Some unique indexes don't have a pg_constraint entry. For example: create table t(a int); create unique index idx_t on t(a); Overall, I came up with the attached patch. Instead of looping through pg_constraint, I loop over pg_index and extract indkey. Multiple indkeys can match against the GROUP BY expression. After some looping, I find the indkey that has the minimum number of columns. For example: the unique-not-null column sets can be (a,b) and (a,b,c); since (a,b) only has two columns, (a,b) is used to remove other columns in the GROUP BY expression. Some tests are copied from Zhang Mingli; the code implementation is quite different.
From 6b6689038de51fc7c56f710f9b613c1998ea2bd1 Mon Sep 17 00:00:00 2001 From: jian he <jian.universal...@gmail.com> Date: Fri, 22 Nov 2024 21:08:59 +0800 Subject: [PATCH v2 1/1] remove useless group by columns via unique index in remove_useless_groupby_columns, if primary key exists and is a subset of group by columns. then we remove non-primary-key column in group-by clause. if primary key is not available, then we try to use columns that aer unique and not-null to remove other useless columns. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark --- src/backend/catalog/index.c | 99 ++++++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 66 +++++++++++++++- src/include/catalog/index.h | 1 + src/test/regress/expected/aggregates.out | 75 ++++++++++++++++++ src/test/regress/sql/aggregates.sql | 39 ++++++++++ 5 files changed, 278 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f9bb721c5f..ad97925278 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -4257,3 +4257,102 @@ RestoreReindexState(const void *reindexstate) /* Note the worker has its own transaction nesting level */ reindexingNestLevel = GetCurrentTransactionNestLevel(); } + +/* + * given a input relation oid, return a list of attnums that have unique index + * and not-null constraint associated with it. note we do not check this + * relation's primary key. +*/ +List * +get_unique_not_null_attnos(Oid relid) +{ + List *not_null_attnos = NIL; + Relation pg_index; + HeapTuple indexTuple; + SysScanDesc scan; + ScanKeyData skey; + List *not_null_cs; + List *tlist = NIL; + + /* + * Use cooked to fetch attnos. 
note: parimary key have a seperated not-null + * constraint + * */ + not_null_cs = RelationGetNotNullConstraints(relid, true, false); + if (not_null_cs == NIL) + return NULL; + + foreach_ptr(CookedConstraint, cc, not_null_cs) + not_null_attnos = lappend_int(not_null_attnos, cc->attnum); + + /* Scan pg_index for unique index of the target rel */ + pg_index = table_open(IndexRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Bitmapset *unique_notnull_attnos = NULL; + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + Datum adatum; + bool isNull; + int numkeys; + bool all_notnull; + Datum *keys; + int nKeys; + + /* we're only interested in unique index */ + if (!index->indisunique || index->indisprimary) + continue; + + /* Skip invalid, exclusion index or deferred index */ + if (!index->indisvalid || index->indisexclusion || !index->indimmediate) + continue; + + /* Skip expression index or predicate index */ + if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) || + !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL)) + continue; + + /* Extract the pg_index->indkey int2vector */ + adatum = heap_getattr(indexTuple, Anum_pg_index_indkey, + RelationGetDescr(pg_index), &isNull); + if (isNull) + elog(ERROR, "null int2vector for index %u", index->indexrelid); + + deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID, + &keys, NULL, &nKeys); + if(nKeys != index->indnatts) + elog(ERROR, "corrupted int2vector for index %u", index->indexrelid); + + Assert(nKeys >= index->indnkeyatts); + numkeys = index->indnkeyatts; + + all_notnull = true; + for (int i = 0; i < numkeys; i++) + { + /* Skip if unique key attnum don't have not-null */ + if (!list_member_int(not_null_attnos, DatumGetInt16(keys[i]))) + { + 
all_notnull = false; + break; + } + unique_notnull_attnos = bms_add_member(unique_notnull_attnos, + DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber); + } + if (unique_notnull_attnos != NULL) + tlist = lappend(tlist, unique_notnull_attnos); + + if(!all_notnull) + continue; + } + systable_endscan(scan); + + table_close(pg_index, AccessShareLock); + return tlist; +} \ No newline at end of file diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 1f78dc3d53..4341ac9aae 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -27,6 +27,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/index.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -2797,6 +2798,9 @@ remove_useless_groupby_columns(PlannerInfo *root) Bitmapset *relattnos; Bitmapset *pkattnos; Oid constraintOid; + Bitmapset *attnums = NULL; + List *unique_notnull_attnos = NIL; + List *matched = NIL; relid++; @@ -2819,17 +2823,67 @@ remove_useless_groupby_columns(PlannerInfo *root) /* * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. + * (i.e., nondeferrable) primary key constraint.However if there is + * columns both have unique-index and not-null constraint, we can also + * remove some columns. If primary key is there, we won't looking for + * unique-not-null columns. 
*/ pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); if (pkattnos == NULL) + { + /* get unique not null index */ + unique_notnull_attnos = get_unique_not_null_attnos(rte->relid); + if (list_length(unique_notnull_attnos) > 0) + { + int out_num, + inner_num; + + foreach_node(Bitmapset, attnos, unique_notnull_attnos) + { + Assert(attnos != NULL); + + if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1) + continue; + else + matched = lappend(matched, attnos); + } + + /* there can be many unique-index-not-null attnums that are + * subset of groupbyattnos. we choose one that is less number of + * members.if there are many equal number of attnums, we simply + * choose the first one. we don't use foreach here, since we + * delete cells in list + */ + for (int outerpos = 0; outerpos < list_length(matched); outerpos++) + { + Bitmapset *bt; + bt = (Bitmapset *) list_nth(matched, outerpos); + + out_num = bms_num_members(bt); + for (int restpos = outerpos + 1; restpos < list_length(matched);) + { + Bitmapset *other; + other = (Bitmapset *) list_nth(matched, restpos); + inner_num = bms_num_members(other); + if (inner_num > out_num) + matched = list_delete_nth_cell(matched, restpos); + else if (inner_num < out_num) + matched = list_delete_nth_cell(matched, outerpos); + restpos++; + } + } + if (list_length(matched) > 0) + attnums = (Bitmapset *) list_nth(matched, 0); + } + } + if ((pkattnos == NULL && attnums == NULL)) continue; /* * If the primary key is a proper subset of relattnos then we have * some items in the GROUP BY that can be removed. 
*/ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + if (pkattnos != NULL && bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) { /* * To easily remember whether we've found anything to do, we don't @@ -2842,6 +2896,14 @@ remove_useless_groupby_columns(PlannerInfo *root) /* Remember the attnos of the removable columns */ surplusvars[relid] = bms_difference(relattnos, pkattnos); } + else if(attnums != NULL) + { + if (surplusvars == NULL) + surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + 1)); + + surplusvars[relid] = bms_difference(relattnos, attnums); + } } /* diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..dacc0f6ebd 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern List *get_unique_not_null_attnos(Oid relid); /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..9bd91c4f38 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1454,6 +1454,81 @@ drop table t2; drop table t3; drop table p_t1; -- +-- Test removal of redundant GROUP BY columns using unique not null index. 
+-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10); +ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b, c + -> Seq Scan on t1 +(3 rows) + +-- both unique not null index and primary key is there, +-- using primary key to remove useless group by columns +explain (costs off) select * from t2 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t2 +(3 rows) + +-- Test primary key beats unique not null index. +explain (costs off) select * from t3 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t3 +(3 rows) + +-- Test unique not null indices have overlap. 
+explain (costs off) select * from t4 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t4 +(3 rows) + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c, b + -> Seq Scan on t4 +(3 rows) + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + QUERY PLAN +----------------------------------- + HashAggregate + Group Key: t5.a, t5.b + -> Append + -> Seq Scan on t5_0 t5_1 + -> Seq Scan on t5_1 t5_2 +(5 rows) + +drop table t1, t2,t3,t4,t5; +-- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- create temp table t1(f1 int, f2 int); diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..d07c599a04 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -512,6 +512,45 @@ drop table t2; drop table t3; drop table p_t1; +-- +-- Test removal of redundant GROUP BY columns using unique not null index. 
+-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); + +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10); +ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; + +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + +-- both unique not null index and primary key is there, +-- using primary key to remove useless group by columns +explain (costs off) select * from t2 group by a,b,c; + +-- Test primary key beats unique not null index. +explain (costs off) select * from t3 group by a,b,c,d; + +-- Test unique not null indices have overlap. +explain (costs off) select * from t4 group by a,b,c,d; + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + +drop table t1, t2,t3,t4,t5; -- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- -- 2.34.1