Hi.

I've rebased patch on master and it'seems to me there's one more issue -

when we call DefineIndexConcurrentInternal() in partitioned case, it waits for transactions, locking tableId, not tabrelid - heaprelid LockRelId is constructed for parent index relation, not for child index relation.

Attaching fixed version.

Also I'm not sure what to do with locking of child relations. If we don't do anything, you can drop one of the partitioned table childs while CIC is in progress, and get error

ERROR:  cache lookup failed for index 16399

If you try to lock all child tables in CIC session, you'll get deadlocks.

--
Best regards,
Alexander Pyhalov,
Postgres Professional
From 37a13b7fa1c3277b9d038b7a0c75399ff05b28a7 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pry...@telsasoft.com>
Date: Mon, 29 Jan 2024 10:41:01 +0300
Subject: [PATCH] Allow CREATE INDEX CONCURRENTLY on partitioned table

---
 doc/src/sgml/ddl.sgml                  |   4 +-
 doc/src/sgml/ref/create_index.sgml     |  14 +-
 src/backend/commands/indexcmds.c       | 200 ++++++++++++++++++-------
 src/test/regress/expected/indexing.out | 127 +++++++++++++++-
 src/test/regress/sql/indexing.sql      |  26 +++-
 5 files changed, 296 insertions(+), 75 deletions(-)

diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml
index 075ff329912..8ee80c40e3b 100644
--- a/doc/src/sgml/ddl.sgml
+++ b/doc/src/sgml/ddl.sgml
@@ -4194,9 +4194,7 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02
      so that they are applied automatically to the entire hierarchy.
      This is very
      convenient, as not only will the existing partitions become indexed, but
-     also any partitions that are created in the future will.  One limitation is
-     that it's not possible to use the <literal>CONCURRENTLY</literal>
-     qualifier when creating such a partitioned index.  To avoid long lock
+     also any partitions that are created in the future will.  To avoid long lock
      times, it is possible to use <command>CREATE INDEX ON ONLY</command>
      the partitioned table; such an index is marked invalid, and the partitions
      do not get the index applied automatically.  The indexes on partitions can
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 40986aa502f..b05102efdaf 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -645,7 +645,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
    <para>
     If a problem arises while scanning the table, such as a deadlock or a
     uniqueness violation in a unique index, the <command>CREATE INDEX</command>
-    command will fail but leave behind an <quote>invalid</quote> index. This index
+    command will fail but leave behind an <quote>invalid</quote> index.
+    If this happens while build an index concurrently on a partitioned
+    table, the command can also leave behind <quote>valid</quote> or
+    <quote>invalid</quote> indexes on table partitions.  The invalid index
     will be ignored for querying purposes because it might be incomplete;
     however it will still consume update overhead. The <application>psql</application>
     <command>\d</command> command will report such an index as <literal>INVALID</literal>:
@@ -692,15 +695,6 @@ Indexes:
     cannot.
    </para>
 
-   <para>
-    Concurrent builds for indexes on partitioned tables are currently not
-    supported.  However, you may concurrently build the index on each
-    partition individually and then finally create the partitioned index
-    non-concurrently in order to reduce the time where writes to the
-    partitioned table will be locked out.  In this case, building the
-    partitioned index is a metadata only operation.
-   </para>
-
   </refsect2>
  </refsect1>
 
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index ab8b81b3020..65477aeb3a8 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -93,6 +93,11 @@ static char *ChooseIndexName(const char *tabname, Oid namespaceId,
 							 bool primary, bool isconstraint);
 static char *ChooseIndexNameAddition(const List *colnames);
 static List *ChooseIndexColumnNames(const List *indexElems);
+static void DefineIndexConcurrentInternal(Oid relationId,
+							  Oid indexRelationId,
+							  IndexInfo *indexInfo,
+							  LOCKTAG heaplocktag,
+							  LockRelId heaprelid);
 static void ReindexIndex(const RangeVar *indexRelation, const ReindexParams *params,
 						 bool isTopLevel);
 static void RangeVarCallbackForReindexIndex(const RangeVar *relation,
@@ -554,7 +559,6 @@ DefineIndex(Oid tableId,
 	bool		amissummarizing;
 	amoptions_function amoptions;
 	bool		partitioned;
-	bool		safe_index;
 	Datum		reloptions;
 	int16	   *coloptions;
 	IndexInfo  *indexInfo;
@@ -562,12 +566,10 @@ DefineIndex(Oid tableId,
 	bits16		constr_flags;
 	int			numberOfAttributes;
 	int			numberOfKeyAttributes;
-	TransactionId limitXmin;
 	ObjectAddress address;
 	LockRelId	heaprelid;
 	LOCKTAG		heaplocktag;
 	LOCKMODE	lockmode;
-	Snapshot	snapshot;
 	Oid			root_save_userid;
 	int			root_save_sec_context;
 	int			root_save_nestlevel;
@@ -697,20 +699,6 @@ DefineIndex(Oid tableId,
 	 * partition.
 	 */
 	partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE;
-	if (partitioned)
-	{
-		/*
-		 * Note: we check 'stmt->concurrent' rather than 'concurrent', so that
-		 * the error is thrown also for temporary tables.  Seems better to be
-		 * consistent, even though we could do it on temporary table because
-		 * we're not actually doing it concurrently.
-		 */
-		if (stmt->concurrent)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("cannot create index on partitioned table \"%s\" concurrently",
-							RelationGetRelationName(rel))));
-	}
 
 	/*
 	 * Don't try to CREATE INDEX on temp tables of other backends.
@@ -1100,10 +1088,6 @@ DefineIndex(Oid tableId,
 		}
 	}
 
-	/* Is index safe for others to ignore?  See set_indexsafe_procflags() */
-	safe_index = indexInfo->ii_Expressions == NIL &&
-		indexInfo->ii_Predicate == NIL;
-
 	/*
 	 * Report index creation if appropriate (delay this till after most of the
 	 * error checks)
@@ -1168,6 +1152,11 @@ DefineIndex(Oid tableId,
 		if (pd->nparts != 0)
 			flags |= INDEX_CREATE_INVALID;
 	}
+	else if (concurrent && OidIsValid(parentIndexId))
+	{
+		/* If concurrent, initially build index partitions as "invalid" */
+		flags |= INDEX_CREATE_INVALID;
+	}
 
 	if (stmt->deferrable)
 		constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE;
@@ -1516,21 +1505,7 @@ DefineIndex(Oid tableId,
 			 */
 			if (invalidate_parent)
 			{
-				Relation	pg_index = table_open(IndexRelationId, RowExclusiveLock);
-				HeapTuple	tup,
-							newtup;
-
-				tup = SearchSysCache1(INDEXRELID,
-									  ObjectIdGetDatum(indexRelationId));
-				if (!HeapTupleIsValid(tup))
-					elog(ERROR, "cache lookup failed for index %u",
-						 indexRelationId);
-				newtup = heap_copytuple(tup);
-				((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false;
-				CatalogTupleUpdate(pg_index, &tup->t_self, newtup);
-				ReleaseSysCache(tup);
-				table_close(pg_index, RowExclusiveLock);
-				heap_freetuple(newtup);
+				index_set_state_flags(indexRelationId, INDEX_DROP_CLEAR_VALID);
 
 				/*
 				 * CCI here to make this update visible, in case this recurses
@@ -1542,37 +1517,49 @@ DefineIndex(Oid tableId,
 
 		/*
 		 * Indexes on partitioned tables are not themselves built, so we're
-		 * done here.
+		 * done here in the non-concurrent case.
 		 */
-		AtEOXact_GUC(false, root_save_nestlevel);
-		SetUserIdAndSecContext(root_save_userid, root_save_sec_context);
-		table_close(rel, NoLock);
-		if (!OidIsValid(parentIndexId))
-			pgstat_progress_end_command();
-		else
+		if (!concurrent)
 		{
-			/* Update progress for an intermediate partitioned index itself */
-			pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1);
-		}
+			AtEOXact_GUC(false, root_save_nestlevel);
+			SetUserIdAndSecContext(root_save_userid, root_save_sec_context);
+			table_close(rel, NoLock);
 
-		return address;
+			if (!OidIsValid(parentIndexId))
+				pgstat_progress_end_command();
+			else
+			{
+				/*
+				 * Update progress for an intermediate partitioned index
+				 * itself
+				 */
+				pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1);
+			}
+
+			return address;
+		}
 	}
 
 	AtEOXact_GUC(false, root_save_nestlevel);
 	SetUserIdAndSecContext(root_save_userid, root_save_sec_context);
 
-	if (!concurrent)
+	/*
+	 * All done in the non-concurrent case, and when building catalog entries
+	 * of partitions for CIC.
+	 */
+	if (!concurrent || OidIsValid(parentIndexId))
 	{
-		/* Close the heap and we're done, in the non-concurrent case */
 		table_close(rel, NoLock);
 
 		/*
 		 * If this is the top-level index, the command is done overall;
-		 * otherwise, increment progress to report one child index is done.
+		 * otherwise (when being called recursively), increment progress to
+		 * report that one child index is done.  Except in the concurrent
+		 * (catalog-only) case, which is handled later.
 		 */
 		if (!OidIsValid(parentIndexId))
 			pgstat_progress_end_command();
-		else
+		else if (!concurrent)
 			pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1);
 
 		return address;
@@ -1583,6 +1570,113 @@ DefineIndex(Oid tableId,
 	SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
 	table_close(rel, NoLock);
 
+	if (!partitioned)
+	{
+		/* CREATE INDEX CONCURRENTLY on a nonpartitioned table */
+		DefineIndexConcurrentInternal(tableId, indexRelationId,
+									  indexInfo, heaplocktag, heaprelid);
+		pgstat_progress_end_command();
+		return address;
+	}
+	else
+	{
+		/*
+		 * For CIC on a partitioned table, finish by building indexes on
+		 * partitions
+		 */
+
+		ListCell   *lc;
+		List	   *childs;
+		List	   *tosetvalid = NIL;
+		MemoryContext cic_context,
+					old_context;
+
+		/* Create special memory context for cross-transaction storage */
+		cic_context = AllocSetContextCreate(PortalContext,
+											"Create index concurrently",
+											ALLOCSET_DEFAULT_SIZES);
+
+		old_context = MemoryContextSwitchTo(cic_context);
+		childs = find_all_inheritors(indexRelationId, ShareLock, NULL);
+		MemoryContextSwitchTo(old_context);
+
+		foreach(lc, childs)
+		{
+			Oid			indrelid = lfirst_oid(lc);
+			Oid			tabrelid;
+			char		relkind;
+
+			/*
+			 * Pre-existing partitions which were ATTACHED were already
+			 * counted in the progress report.
+			 */
+			if (get_index_isvalid(indrelid))
+				continue;
+
+			/*
+			 * Partitioned indexes are counted in the progress report, but
+			 * don't need to be further processed.
+			 */
+			relkind = get_rel_relkind(indrelid);
+			if (!RELKIND_HAS_STORAGE(relkind))
+			{
+				/* The toplevel index doesn't count towards "partitions done" */
+				if (indrelid != indexRelationId)
+					pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1);
+
+				/*
+				 * Build up a list of all the intermediate partitioned tables
+				 * which will later need to be set valid.
+				 */
+				old_context = MemoryContextSwitchTo(cic_context);
+				tosetvalid = lappend_oid(tosetvalid, indrelid);
+				MemoryContextSwitchTo(old_context);
+				continue;
+			}
+
+			tabrelid = IndexGetRelation(indrelid, false);
+			rel = table_open(tabrelid, ShareUpdateExclusiveLock);
+			heaprelid = rel->rd_lockInfo.lockRelId;
+			table_close(rel, ShareUpdateExclusiveLock);
+			SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
+
+			/* Process each partition in a separate transaction */
+			DefineIndexConcurrentInternal(tabrelid, indrelid, indexInfo,
+										  heaplocktag, heaprelid);
+
+			PushActiveSnapshot(GetTransactionSnapshot());
+			pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1);
+		}
+
+		/* Set as valid all partitioned indexes, including the parent */
+		foreach(lc, tosetvalid)
+		{
+			Oid			indrelid = lfirst_oid(lc);
+
+			index_set_state_flags(indrelid, INDEX_CREATE_SET_READY);
+			CommandCounterIncrement();
+			index_set_state_flags(indrelid, INDEX_CREATE_SET_VALID);
+		}
+
+		MemoryContextDelete(cic_context);
+		pgstat_progress_end_command();
+		PopActiveSnapshot();
+		return address;
+	}
+}
+
+
+static void
+DefineIndexConcurrentInternal(Oid tableId, Oid indexRelationId, IndexInfo *indexInfo,
+							  LOCKTAG heaplocktag, LockRelId heaprelid)
+{
+	TransactionId limitXmin;
+	Snapshot	snapshot;
+
+	/* Is index safe for others to ignore?  See set_indexsafe_procflags() */
+	bool		safe_index = indexInfo->ii_Expressions == NIL &&
+		indexInfo->ii_Predicate == NIL;
+
 	/*
 	 * For a concurrent build, it's important to make the catalog entries
 	 * visible to other transactions before we start to build the index. That
@@ -1778,10 +1872,6 @@ DefineIndex(Oid tableId,
 	 * Last thing to do is release the session-level lock on the parent table.
 	 */
 	UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock);
-
-	pgstat_progress_end_command();
-
-	return address;
 }
 
 
diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out
index 087f955b1e6..5d529454993 100644
--- a/src/test/regress/expected/indexing.out
+++ b/src/test/regress/expected/indexing.out
@@ -50,11 +50,130 @@ select relname, relkind, relhassubclass, inhparent::regclass
 (8 rows)
 
 drop table idxpart;
--- Some unsupported features
+-- CIC on partitioned table
 create table idxpart (a int, b int, c text) partition by range (a);
-create table idxpart1 partition of idxpart for values from (0) to (10);
-create index concurrently on idxpart (a);
-ERROR:  cannot create index on partitioned table "idxpart" concurrently
+create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a);
+create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a);
+create table idxpart111 partition of idxpart11 default partition by range(a);
+create table idxpart1111 partition of idxpart111 default partition by range(a);
+create table idxpart2 partition of idxpart for values from (10) to (20);
+create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a);
+create table idxpart31 partition of idxpart3 default;
+insert into idxpart2 values(10),(10); -- not unique
+create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves
+create index concurrently on idxpart1 (a); -- partitioned and partition
+create index concurrently on idxpart2 (a); -- leaf
+create index concurrently on idxpart (a); -- partitioned
+create unique index concurrently on idxpart (a); -- partitioned, unique failure
+ERROR:  could not create unique index "idxpart2_a_idx1"
+DETAIL:  Key (a)=(10) is duplicated.
+\d idxpart
+        Partitioned table "public.idxpart"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition key: RANGE (a)
+Indexes:
+    "idxpart_a_idx" btree (a)
+    "idxpart_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 3 (Use \d+ to list them.)
+
+\d idxpart1
+        Partitioned table "public.idxpart1"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart FOR VALUES FROM (0) TO (10)
+Partition key: RANGE (a)
+Indexes:
+    "idxpart1_a_idx" btree (a)
+    "idxpart1_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 1 (Use \d+ to list them.)
+
+\d idxpart11
+       Partitioned table "public.idxpart11"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart1 FOR VALUES FROM (0) TO (10)
+Partition key: RANGE (a)
+Indexes:
+    "idxpart11_a_idx" btree (a)
+    "idxpart11_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 1 (Use \d+ to list them.)
+
+\d idxpart111
+       Partitioned table "public.idxpart111"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart11 DEFAULT
+Partition key: RANGE (a)
+Indexes:
+    "idxpart111_a_idx" btree (a)
+    "idxpart111_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 1 (Use \d+ to list them.)
+
+\d idxpart1111
+      Partitioned table "public.idxpart1111"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart111 DEFAULT
+Partition key: RANGE (a)
+Indexes:
+    "idxpart1111_a_idx" btree (a)
+    "idxpart1111_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 0
+
+\d idxpart2
+              Table "public.idxpart2"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart FOR VALUES FROM (10) TO (20)
+Indexes:
+    "idxpart2_a_idx" btree (a)
+    "idxpart2_a_idx1" UNIQUE, btree (a) INVALID
+
+\d idxpart3
+        Partitioned table "public.idxpart3"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart FOR VALUES FROM (30) TO (40)
+Partition key: RANGE (a)
+Indexes:
+    "idxpart3_a_idx" btree (a)
+    "idxpart3_a_idx1" UNIQUE, btree (a) INVALID
+Number of partitions: 1 (Use \d+ to list them.)
+
+\d idxpart31
+             Table "public.idxpart31"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+ b      | integer |           |          | 
+ c      | text    |           |          | 
+Partition of: idxpart3 DEFAULT
+Indexes:
+    "idxpart31_a_idx" btree (a)
+    "idxpart31_a_idx1" UNIQUE, btree (a) INVALID
+
 drop table idxpart;
 -- Verify bugfix with query on indexed partitioned table with no partitions
 -- https://postgr.es/m/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql
diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql
index 44f6788915c..96c63615e9a 100644
--- a/src/test/regress/sql/indexing.sql
+++ b/src/test/regress/sql/indexing.sql
@@ -29,10 +29,30 @@ select relname, relkind, relhassubclass, inhparent::regclass
 	where relname like 'idxpart%' order by relname;
 drop table idxpart;
 
--- Some unsupported features
+-- CIC on partitioned table
 create table idxpart (a int, b int, c text) partition by range (a);
-create table idxpart1 partition of idxpart for values from (0) to (10);
-create index concurrently on idxpart (a);
+create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a);
+create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a);
+create table idxpart111 partition of idxpart11 default partition by range(a);
+create table idxpart1111 partition of idxpart111 default partition by range(a);
+create table idxpart2 partition of idxpart for values from (10) to (20);
+create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a);
+create table idxpart31 partition of idxpart3 default;
+
+insert into idxpart2 values(10),(10); -- not unique
+create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves
+create index concurrently on idxpart1 (a); -- partitioned and partition
+create index concurrently on idxpart2 (a); -- leaf
+create index concurrently on idxpart (a); -- partitioned
+create unique index concurrently on idxpart (a); -- partitioned, unique failure
+\d idxpart
+\d idxpart1
+\d idxpart11
+\d idxpart111
+\d idxpart1111
+\d idxpart2
+\d idxpart3
+\d idxpart31
 drop table idxpart;
 
 -- Verify bugfix with query on indexed partitioned table with no partitions
-- 
2.34.1

Reply via email to