Re: CLUSTER on partitioned index

Justin Pryzby Sat, 08 Jan 2022 11:55:45 -0800

On Fri, Dec 03, 2021 at 10:16:24AM +0900, Michael Paquier wrote:
> On Thu, Sep 23, 2021 at 06:56:26PM -0500, Justin Pryzby wrote:
> > On Thu, Sep 23, 2021 at 08:18:41PM +0200, Matthias van de Meent wrote:
> >> Note: The following review is based on the assumption that this v11
> >> revision was meant to contain only one patch. I put this up as a note,
> >> because it seemed quite limited when compared to earlier versions of
> >> the patchset.
> > 
> > Alvaro's critique was that the patchset was too complicated for what was
> > claimed to be a marginal feature.  My response was to rearrange the 
> > patchset to
> > its minimal form, supporting CLUSTER without marking the index as clustered.
> >
> > My goal is to present a minimal patch and avoid any nonessential complexity.
> 
> FWIW, my opinion on the matter is similar to Alvaro's, and an extra
> read of the patch gives me the same impression.  Let's see if others
> have an opinion on the matter.


You and Alvaro thought the patch was too complicated for its value, so I
reduced the scope to its essential form.

CLUSTER was claimed to be of marginal utility, since the table is locked.
But locking one partition at a time would be less disruptive than locking an
equivalent non-partitioned table.

There's only about a dozen, other remaining restrictions/limitations on
partitioned tables, (AMs, triggers, identity, generated, exclusion, CIC,
FREEZE).  

Since it's supported to VACUUM (including VACUUM FULL) and REINDEX a
partitioned table, I'm still suprised there's much hesitation to support
CLUSTER (which is used by vacuum full).

> > Thanks for looking.  I'm going to see about updating comments based on
> > corresponding parts of vacuum and on this message itself.
> 
> It doesn't feel right to just discard the patch at this stage, and it
> needs an update, so I have moved it to the next CF for now, waiting on
> author.  If this does not really move on, my suggestion is to discard
> the patch at the end of next CF, aka 2022-01.

This includes minor updates based on Mathias review (commit message and test
case).

-- 
Justin

>From cbeb98a016a05738007c98ef0a3828bdedf83d24 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <[email protected]>
Date: Sun, 7 Jun 2020 16:58:42 -0500
Subject: [PATCH v11] Implement CLUSTER of partitioned table..

VACUUM (including vacuum full) has recursed into partitioned tables since
partitioning were introduced in v10 (3c3bb9933).  See expand_vacuum_rel().

For VACUUM FULL, vacuum_rel() calls cluster_rel() for each partition.
This patch is to make cluster() do all the same stuff before calling
cluster_rel().

For now, partitioned indexes cannot be marked clustered, so clustering requires
specification of a partitioned index on the partitioned table.

See also a556549d7 and 19de0ab23.
---
 doc/src/sgml/ref/cluster.sgml         |   6 +
 src/backend/commands/cluster.c        | 177 ++++++++++++++++++++------
 src/bin/psql/tab-complete.c           |   1 +
 src/include/commands/cluster.h        |   1 +
 src/test/regress/expected/cluster.out |  45 ++++++-
 src/test/regress/sql/cluster.sql      |  21 ++-
 6 files changed, 204 insertions(+), 47 deletions(-)

diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml
index 86f5fdc469b..b3463ae5c46 100644
--- a/doc/src/sgml/ref/cluster.sgml
+++ b/doc/src/sgml/ref/cluster.sgml
@@ -196,6 +196,12 @@ CLUSTER [VERBOSE]
     in the <structname>pg_stat_progress_cluster</structname> view. See
     <xref linkend="cluster-progress-reporting"/> for details.
   </para>
+
+   <para>
+    Clustering a partitioned table clusters each of its partitions using the
+    partition of the specified partitioned index (which must be specified).
+   </para>
+
  </refsect1>
 
  <refsect1>
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 61853e6dec4..196c7f7eeb2 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -32,7 +32,9 @@
 #include "catalog/index.h"
 #include "catalog/namespace.h"
 #include "catalog/objectaccess.h"
+#include "catalog/partition.h"
 #include "catalog/pg_am.h"
+#include "catalog/pg_inherits.h"
 #include "catalog/toasting.h"
 #include "commands/cluster.h"
 #include "commands/defrem.h"
@@ -73,6 +75,9 @@ static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 							bool verbose, bool *pSwapToastByContent,
 							TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
 static List *get_tables_to_cluster(MemoryContext cluster_context);
+static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
+		Oid indexOid);
+static void cluster_multiple_rels(List *rvs, int options);
 
 
 /*---------------------------------------------------------------------------
@@ -131,6 +136,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		Relation	rel;
 
 		/* Find, lock, and check permissions on the table */
+		/* Obtain AEL now to avoid lock-upgrade hazard in the single-transaction case */
 		tableOid = RangeVarGetRelidExtended(stmt->relation,
 											AccessExclusiveLock,
 											0,
@@ -146,14 +152,6 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("cannot cluster temporary tables of other sessions")));
 
-		/*
-		 * Reject clustering a partitioned table.
-		 */
-		if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("cannot cluster a partitioned table")));
-
 		if (stmt->indexname == NULL)
 		{
 			ListCell   *index;
@@ -188,11 +186,50 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 								stmt->indexname, stmt->relation->relname)));
 		}
 
-		/* close relation, keep lock till commit */
-		table_close(rel, NoLock);
+		if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		{
+			/* close relation, keep lock till commit */
+			table_close(rel, NoLock);
 
-		/* Do the job. */
-		cluster_rel(tableOid, indexOid, &params);
+			/* Do the job. */
+			cluster_rel(tableOid, indexOid, &params);
+		}
+		else
+		{
+			List	   *rvs;
+			MemoryContext cluster_context;
+
+			/*
+			 * Expand partitioned relations for CLUSTER (the corresponding
+			 * thing for VACUUM FULL happens in and around expand_vacuum_rel()
+			 */
+
+			/* Refuse to hold strong locks in a user transaction */
+			PreventInTransactionBlock(isTopLevel, "CLUSTER");
+
+			cluster_context = AllocSetContextCreate(PortalContext,
+												"Cluster",
+												ALLOCSET_DEFAULT_SIZES);
+
+			rvs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
+
+			/*
+			 * For now, partitioned indexes are not actually marked clustered.
+			 */
+			check_index_is_clusterable(rel, indexOid, true, AccessShareLock);
+
+			/* close relation, releasing lock on parent table */
+			table_close(rel, AccessExclusiveLock);
+
+			/* Do the job. */
+			cluster_multiple_rels(rvs, params.options);
+
+			/* Start a new transaction for the cleanup work. */
+			StartTransactionCommand();
+
+			/* Clean up working storage */
+			MemoryContextDelete(cluster_context);
+		}
 	}
 	else
 	{
@@ -202,7 +239,6 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		 */
 		MemoryContext cluster_context;
 		List	   *rvs;
-		ListCell   *rv;
 
 		/*
 		 * We cannot run this form of CLUSTER inside a user transaction block;
@@ -225,28 +261,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		 * cluster_context.
 		 */
 		rvs = get_tables_to_cluster(cluster_context);
-
-		/* Commit to get out of starting transaction */
-		PopActiveSnapshot();
-		CommitTransactionCommand();
-
-		/* Ok, now that we've got them all, cluster them one by one */
-		foreach(rv, rvs)
-		{
-			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
-			ClusterParams cluster_params = params;
-
-			/* Start a new transaction for each relation. */
-			StartTransactionCommand();
-			/* functions in indexes may want a snapshot set */
-			PushActiveSnapshot(GetTransactionSnapshot());
-			/* Do the job. */
-			cluster_params.options |= CLUOPT_RECHECK;
-			cluster_rel(rvtc->tableOid, rvtc->indexOid,
-						&cluster_params);
-			PopActiveSnapshot();
-			CommitTransactionCommand();
-		}
+		cluster_multiple_rels(rvs, params.options | CLUOPT_RECHECK_ISCLUSTERED);
 
 		/* Start a new transaction for the cleanup work. */
 		StartTransactionCommand();
@@ -327,10 +342,11 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 		/*
 		 * Silently skip a temp table for a remote session.  Only doing this
 		 * check in the "recheck" case is appropriate (which currently means
-		 * somebody is executing a database-wide CLUSTER), because there is
-		 * another check in cluster() which will stop any attempt to cluster
-		 * remote temp tables by name.  There is another check in cluster_rel
-		 * which is redundant, but we leave it for extra safety.
+		 * somebody is executing a database-wide CLUSTER or on a partitioned
+		 * table), because there is another check in cluster() which will stop
+		 * any attempt to cluster remote temp tables by name.  There is another
+		 * check in cluster_rel which is redundant, but we leave it for extra
+		 * safety.
 		 */
 		if (RELATION_IS_OTHER_TEMP(OldHeap))
 		{
@@ -352,9 +368,10 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 			}
 
 			/*
-			 * Check that the index is still the one with indisclustered set.
+			 * Check that the index is still the one with indisclustered set, if needed.
 			 */
-			if (!get_index_isclustered(indexOid))
+			if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
+					!get_index_isclustered(indexOid))
 			{
 				relation_close(OldHeap, AccessExclusiveLock);
 				pgstat_progress_end_command();
@@ -415,6 +432,9 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 		return;
 	}
 
+	// Should silently skip this rather than assert ?
+	Assert(RELKIND_HAS_STORAGE(OldHeap->rd_rel->relkind));
+
 	/*
 	 * All predicate locks on the tuples or pages are about to be made
 	 * invalid, because we move tuples around.  Promote them to relation
@@ -585,8 +605,8 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
 	TransactionId frozenXid;
 	MultiXactId cutoffMulti;
 
-	/* Mark the correct index as clustered */
 	if (OidIsValid(indexOid))
+		/* Mark the correct index as clustered */
 		mark_index_clustered(OldHeap, indexOid, true);
 
 	/* Remember info about rel before closing OldHeap */
@@ -1604,3 +1624,76 @@ get_tables_to_cluster(MemoryContext cluster_context)
 
 	return rvs;
 }
+
+/*
+ * Return a List of tables and their associated indexes, where each index is a
+ * partition of the given index
+ * We're called with a lock held on the parent table.
+ */
+static List *
+get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
+{
+	List		*inhoids;
+	ListCell	*lc;
+	List		*rvs = NIL;
+	MemoryContext	old_context;
+
+	/* Do not lock the children until they're processed. */
+	inhoids = find_all_inheritors(indexOid, NoLock, NULL);
+
+	/*
+	 * We have to build the list in a different memory context so it will
+	 * survive the cross-transaction processing
+	 */
+	old_context = MemoryContextSwitchTo(cluster_context);
+
+	foreach(lc, inhoids)
+	{
+		Oid		indexrelid = lfirst_oid(lc);
+		Oid		relid = IndexGetRelation(indexrelid, false);
+		RelToCluster	*rvtc;
+
+		if (!RELKIND_HAS_STORAGE(get_rel_relkind(indexrelid)))
+			continue;
+
+		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+		rvtc->tableOid = relid;
+		rvtc->indexOid = indexrelid;
+		rvs = lappend(rvs, rvtc);
+	}
+
+	MemoryContextSwitchTo(old_context);
+	return rvs;
+}
+
+/* Cluster each relation in a separate transaction */
+static void
+cluster_multiple_rels(List *rvs, int options)
+{
+	ListCell *lc;
+
+	/* Commit to get out of starting transaction */
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+
+	/* Ok, now that we've got them all, cluster them one by one */
+	foreach(lc, rvs)
+	{
+		RelToCluster *rvtc = (RelToCluster *) lfirst(lc);
+		ClusterParams cluster_params = { .options = options, };
+
+		/* Start a new transaction for each relation. */
+		StartTransactionCommand();
+
+		/* functions in indexes may want a snapshot set */
+		PushActiveSnapshot(GetTransactionSnapshot());
+
+		/* Do the job. */
+		cluster_params.options |= CLUOPT_RECHECK;
+		cluster_rel(rvtc->tableOid, rvtc->indexOid,
+					&cluster_params);
+
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+	}
+}
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 39be6f556a8..9944f21f71c 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -599,6 +599,7 @@ static const SchemaQuery Query_for_list_of_clusterables = {
 	.catname = "pg_catalog.pg_class c",
 	.selcondition =
 	"c.relkind IN (" CppAsString2(RELKIND_RELATION) ", "
+	CppAsString2(RELKIND_PARTITIONED_TABLE) ", "
 	CppAsString2(RELKIND_MATVIEW) ")",
 	.viscondition = "pg_catalog.pg_table_is_visible(c.oid)",
 	.namespace = "c.relnamespace",
diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h
index 3db375d7cc7..0605b053b61 100644
--- a/src/include/commands/cluster.h
+++ b/src/include/commands/cluster.h
@@ -22,6 +22,7 @@
 /* flag bits for ClusterParams->flags */
 #define CLUOPT_RECHECK 0x01		/* recheck relation state */
 #define CLUOPT_VERBOSE 0x02		/* print progress info */
+#define CLUOPT_RECHECK_ISCLUSTERED 0x04	/* recheck relation state for indisclustered */
 
 /* options for CLUSTER */
 typedef struct ClusterParams
diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out
index e46a66952f0..3f2758d13f6 100644
--- a/src/test/regress/expected/cluster.out
+++ b/src/test/regress/expected/cluster.out
@@ -444,13 +444,52 @@ DROP TABLE clustertest;
 CREATE TABLE clustertest (f1 int PRIMARY KEY);
 CLUSTER clustertest USING clustertest_pkey;
 CLUSTER clustertest;
--- Check that partitioned tables cannot be clustered
+-- Check that partitioned tables can be clustered
 CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart1 PARTITION OF clstrpart FOR VALUES FROM (1)TO(10) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart11 PARTITION OF clstrpart1 FOR VALUES FROM (1)TO(10);
+CREATE TABLE clstrpart12 PARTITION OF clstrpart1 FOR VALUES FROM (10)TO(20) PARTITION BY RANGE(a);
+CREATE TABLE clstrpart2 PARTITION OF clstrpart FOR VALUES FROM (20)TO(30);
+CREATE TABLE clstrpart3 PARTITION OF clstrpart DEFAULT PARTITION BY RANGE(a);
+CREATE TABLE clstrpart33 PARTITION OF clstrpart3 DEFAULT;
+CREATE INDEX clstrpart_only_idx ON ONLY clstrpart (a);
+CLUSTER clstrpart USING clstrpart_only_idx; -- fails
+ERROR:  cannot cluster on invalid index "clstrpart_only_idx"
+DROP INDEX clstrpart_only_idx;
 CREATE INDEX clstrpart_idx ON clstrpart (a);
+-- Check that clustering sets new relfilenodes:
+CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
+CLUSTER clstrpart USING clstrpart_idx;
+CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
+SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C";
+   relname   | level | relkind | ?column? 
+-------------+-------+---------+----------
+ clstrpart   |     0 | p       | t
+ clstrpart1  |     1 | p       | t
+ clstrpart11 |     2 | r       | f
+ clstrpart12 |     2 | p       | t
+ clstrpart2  |     1 | r       | f
+ clstrpart3  |     1 | p       | t
+ clstrpart33 |     2 | r       | f
+(7 rows)
+
+-- Partitioned indexes aren't and can't be marked un/clustered:
+\d clstrpart
+       Partitioned table "public.clstrpart"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+Partition key: RANGE (a)
+Indexes:
+    "clstrpart_idx" btree (a)
+Number of partitions: 3 (Use \d+ to list them.)
+
+CLUSTER clstrpart;
+ERROR:  there is no previously clustered index for table "clstrpart"
+ALTER TABLE clstrpart SET WITHOUT CLUSTER;
+ERROR:  cannot mark index clustered in partitioned table
 ALTER TABLE clstrpart CLUSTER ON clstrpart_idx;
 ERROR:  cannot mark index clustered in partitioned table
-CLUSTER clstrpart USING clstrpart_idx;
-ERROR:  cannot cluster a partitioned table
 DROP TABLE clstrpart;
 -- Test CLUSTER with external tuplesorting
 create table clstr_4 as select * from tenk1;
diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql
index aee9cf83e04..74118993a82 100644
--- a/src/test/regress/sql/cluster.sql
+++ b/src/test/regress/sql/cluster.sql
@@ -202,11 +202,28 @@ CREATE TABLE clustertest (f1 int PRIMARY KEY);
 CLUSTER clustertest USING clustertest_pkey;
 CLUSTER clustertest;
 
--- Check that partitioned tables cannot be clustered
+-- Check that partitioned tables can be clustered
 CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart1 PARTITION OF clstrpart FOR VALUES FROM (1)TO(10) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart11 PARTITION OF clstrpart1 FOR VALUES FROM (1)TO(10);
+CREATE TABLE clstrpart12 PARTITION OF clstrpart1 FOR VALUES FROM (10)TO(20) PARTITION BY RANGE(a);
+CREATE TABLE clstrpart2 PARTITION OF clstrpart FOR VALUES FROM (20)TO(30);
+CREATE TABLE clstrpart3 PARTITION OF clstrpart DEFAULT PARTITION BY RANGE(a);
+CREATE TABLE clstrpart33 PARTITION OF clstrpart3 DEFAULT;
+CREATE INDEX clstrpart_only_idx ON ONLY clstrpart (a);
+CLUSTER clstrpart USING clstrpart_only_idx; -- fails
+DROP INDEX clstrpart_only_idx;
 CREATE INDEX clstrpart_idx ON clstrpart (a);
-ALTER TABLE clstrpart CLUSTER ON clstrpart_idx;
+-- Check that clustering sets new relfilenodes:
+CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
 CLUSTER clstrpart USING clstrpart_idx;
+CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
+SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C";
+-- Partitioned indexes aren't and can't be marked un/clustered:
+\d clstrpart
+CLUSTER clstrpart;
+ALTER TABLE clstrpart SET WITHOUT CLUSTER;
+ALTER TABLE clstrpart CLUSTER ON clstrpart_idx;
 DROP TABLE clstrpart;
 
 -- Test CLUSTER with external tuplesorting
-- 
2.17.1

Re: CLUSTER on partitioned index

Reply via email to