Re: Remove useless GROUP BY columns considering unique index

jian he Tue, 26 Nov 2024 22:51:36 -0800

On Fri, Nov 22, 2024 at 9:24 PM jian he <jian.universal...@gmail.com> wrote:
>
> overall i come up with the attached patch.
in v2:


create table t1 (a int, b int not null, c int, unique(b, c));
explain(costs off) select count(*) from t1 group by b,c,a;
      QUERY PLAN
----------------------
 HashAggregate
   Group Key: b
   ->  Seq Scan on t1

so the v2 implementation is wrong.
I overlooked cases like: only part of the unique-key is not-null.
In that situation, we cannot remove useless groupby columns.

v3 attached. in v3:
      QUERY PLAN
----------------------
 HashAggregate
   Group Key: b, c, a
   ->  Seq Scan on t1

From 85a64d350f0282c4bfb79a0f70d57eca29a78308 Mon Sep 17 00:00:00 2001
From: jian he <jian.universal...@gmail.com>
Date: Wed, 27 Nov 2024 14:45:03 +0800
Subject: [PATCH v3 1/1] remove useless group by columns via unique-not-null

in remove_useless_groupby_columns, if primary key exists and is a subset of
group by columns. then we remove non-primary-key column in group-by clause.
if primary key is not available, then we try to use columns that are unique and
not-null to remove other useless columns.

discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark
---
 src/backend/catalog/index.c              | 94 +++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     | 74 +++++++++++++++++-
 src/include/catalog/index.h              |  1 +
 src/test/regress/expected/aggregates.out | 96 ++++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql      | 47 ++++++++++++
 5 files changed, 309 insertions(+), 3 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..cd5b1ff3e6 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,97 @@ RestoreReindexState(const void *reindexstate)
 	/* Note the worker has its own transaction nesting level */
 	reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
+
+/*
+ * given a input relation oid, return a list of attnum Bitmapset(soure:
+ * pg_index.indkey) that have unique index *and* not-null constraint associate
+ * with it. note we do not check this relation's primary key.
+*/
+List *
+get_unique_not_null_attnos(Oid relid)
+{
+	List	   	*not_null_attnos = NIL;
+	Relation	pg_index;
+	HeapTuple	indexTuple;
+	SysScanDesc scan;
+	ScanKeyData skey;
+	List 		*not_null_cs;
+	List	   *tlist = NIL;
+
+	/* Use CookedConstraint to fetch not-null constraint. */
+	not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+	if (not_null_cs == NIL)
+		return NULL;
+
+	foreach_ptr(CookedConstraint, cc, not_null_cs)
+		not_null_attnos = lappend_int(not_null_attnos, cc->attnum);
+
+	/* Scan pg_index for unique index of the target rel */
+	pg_index = table_open(IndexRelationId, AccessShareLock);
+
+	ScanKeyInit(&skey,
+				Anum_pg_index_indrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+							  NULL, 1, &skey);
+
+	while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+	{
+		Bitmapset  	*unique_notnull_attnos = NULL;
+		Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Datum		adatum;
+		bool		isNull;
+		bool		all_notnull = true;
+		int			numkeys;
+		Datum	   *keys;
+		int			nKeys;
+
+		/* we're only interested in unique index */
+		if (!index->indisunique || index->indisprimary)
+			continue;
+
+		/* Skip invalid, exclusion index or deferred index */
+		if (!index->indisvalid || index->indisexclusion || !index->indimmediate)
+			continue;
+
+		/* Skip expression index or predicate index */
+		if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+			!heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+			continue;
+
+		/* Extract the pg_index->indkey int2vector */
+		adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+							  RelationGetDescr(pg_index), &isNull);
+		if (isNull)
+			elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+		deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+									&keys, NULL, &nKeys);
+
+		if(nKeys != index->indnatts)
+			elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+
+		Assert(nKeys >= index->indnkeyatts);
+		numkeys = index->indnkeyatts;
+
+		for (int i = 0; i < numkeys; i++)
+		{
+			/* Skip if any of unique key's attnum don't have not-null */
+			if (!list_member_int(not_null_attnos, DatumGetInt16(keys[i])))
+			{
+				all_notnull = false;
+				break;
+			}
+
+			unique_notnull_attnos = bms_add_member(unique_notnull_attnos,
+									  DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber);
+		}
+		if (all_notnull)
+			tlist = lappend(tlist, unique_notnull_attnos);
+	}
+	systable_endscan(scan);
+
+	table_close(pg_index, AccessShareLock);
+	return tlist;
+}
\ No newline at end of file
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762e..39d92b2b53 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "catalog/index.h"
 #include "executor/executor.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -2807,6 +2808,9 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		Bitmapset  *relattnos;
 		Bitmapset  *pkattnos;
 		Oid			constraintOid;
+		Bitmapset  	*attnums = NULL;
+		List  		*unique_notnull_attnos = NIL;
+		List	   	*matched = NIL;
 
 		relid++;
 
@@ -2828,18 +2832,69 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			continue;
 
 		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
+		 * Can't remove any columns for this rel if there is no suitable (i.e.,
+		 * nondeferrable) primary key constraint. However if there is columns
+		 * both have unique-index and not-null constraint, we can also remove
+		 * some columns. If primary key is there, we won't looking for
+		 * unique-not-null constraints.
 		 */
 		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
 		if (pkattnos == NULL)
+		{
+			unique_notnull_attnos = get_unique_not_null_attnos(rte->relid);
+			if (list_length(unique_notnull_attnos) > 0)
+			{
+				int		out_num,
+						inner_num;
+
+				foreach_node(Bitmapset, attnos, unique_notnull_attnos)
+				{
+					Assert(attnos != NULL);
+
+					if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1)
+						continue;
+					else
+						matched = lappend(matched, attnos);
+				}
+				/*
+				 * There may be multiple unique-index-not-null attnums that are
+				 * subsets of groupbyattnos.  We select the one with the fewest
+				 * members.  If there are several with the same number of
+				 * members , we choose the first one. We avoid using `foreach`
+				 * here because we delete items from the list.
+				*/
+				for (int outerpos = 0; outerpos < list_length(matched); outerpos++)
+				{
+					Bitmapset *bt;
+					bt = list_nth_node(Bitmapset, matched, outerpos);
+
+					out_num = bms_num_members(bt);
+					for (int restpos = outerpos + 1; restpos < list_length(matched);)
+					{
+						Bitmapset *other;
+						other = list_nth_node(Bitmapset, matched, restpos);
+
+						inner_num =	bms_num_members(other);
+						if (inner_num > out_num)
+							matched = list_delete_nth_cell(matched, restpos);
+						else if (inner_num < out_num)
+							matched = list_delete_nth_cell(matched, outerpos);
+						restpos++;
+					}
+				}
+				/* If there is at least one candidate, select the first one. */
+				if (list_length(matched) > 0)
+					attnums = list_nth_node(Bitmapset, matched, 0);
+			}
+		}
+		if ((pkattnos == NULL && attnums == NULL))
 			continue;
 
 		/*
 		 * If the primary key is a proper subset of relattnos then we have
 		 * some items in the GROUP BY that can be removed.
 		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		if (pkattnos != NULL && bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
 		{
 			/*
 			 * To easily remember whether we've found anything to do, we don't
@@ -2852,6 +2907,19 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			/* Remember the attnos of the removable columns */
 			surplusvars[relid] = bms_difference(relattnos, pkattnos);
 		}
+		/*
+		 * If attnums is not NULL then this attnums (unique-index-not-null) is a
+		 * proper subset of relattnos. see above. we can remove some items in
+		 * GROUOP BY
+		 */
+		else if(attnums != NULL)
+		{
+			if (surplusvars == NULL)
+				surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+													 (list_length(parse->rtable) + 1));
+
+			surplusvars[relid] = bms_difference(relattnos, attnums);
+		}
 	}
 
 	/*
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..dacc0f6ebd 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
 
 extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
 
+extern List *get_unique_not_null_attnos(Oid relid);
 
 /*
  * itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..563b5c1c0d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,102 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 --
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t1
+(3 rows)
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t2
+(3 rows)
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.a, t5.b
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.b, t5.a
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+       QUERY PLAN        
+-------------------------
+ HashAggregate
+   Group Key: b, c, a, d
+   ->  Seq Scan on t6
+(3 rows)
+
+drop table t1, t2,t3,t4,t5,t6;
+--
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
 create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..66f4f45953 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,53 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 
+--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+
+drop table t1, t2,t3,t4,t5,t6;
 --
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
-- 
2.34.1

Re: Remove useless GROUP BY columns considering unique index

Reply via email to