On Sun, Dec 04, 2022 at 01:09:35PM -0600, Justin Pryzby wrote: > This currently handles partitions with a loop around the whole CIC > implementation, which means that things like WaitForLockers() happen > once for each index, the same as REINDEX CONCURRENTLY on a partitioned > table. Contrast that with ReindexRelationConcurrently(), which handles > all the indexes on a table in one pass by looping around indexes within > each phase.
Rebased over the progress reporting fix (27f5c712b). I added a list of (intermediate) partitioned tables, rather than looping over the list of inheritors again, to save calling rel_get_relkind(). I think this patch is done. -- Justin
>From 941f7f930fc18563e2da42143015b6573d5447b1 Mon Sep 17 00:00:00 2001 From: Justin Pryzby <pryz...@telsasoft.com> Date: Sat, 6 Jun 2020 17:42:23 -0500 Subject: [PATCH] Allow CREATE INDEX CONCURRENTLY on partitioned table https://www.postgresql.org/message-id/flat/20201031063117.gf3...@telsasoft.com --- doc/src/sgml/ddl.sgml | 4 +- doc/src/sgml/ref/create_index.sgml | 14 +- src/backend/commands/indexcmds.c | 200 ++++++++++++++++++------- src/test/regress/expected/indexing.out | 127 +++++++++++++++- src/test/regress/sql/indexing.sql | 26 +++- 5 files changed, 297 insertions(+), 74 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 91c036d1cbe..64efdf1e879 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -4178,9 +4178,7 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 so that they are applied automatically to the entire hierarchy. This is very convenient, as not only will the existing partitions become indexed, but - also any partitions that are created in the future will. One limitation is - that it's not possible to use the <literal>CONCURRENTLY</literal> - qualifier when creating such a partitioned index. To avoid long lock + also any partitions that are created in the future will. To avoid long lock times, it is possible to use <command>CREATE INDEX ON ONLY</command> the partitioned table; such an index is marked invalid, and the partitions do not get the index applied automatically. The indexes on partitions can diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 40986aa502f..b05102efdaf 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -645,7 +645,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class= <para> If a problem arises while scanning the table, such as a deadlock or a uniqueness violation in a unique index, the <command>CREATE INDEX</command> - command will fail but leave behind an <quote>invalid</quote> index. This index + command will fail but leave behind an <quote>invalid</quote> index. + If this happens while build an index concurrently on a partitioned + table, the command can also leave behind <quote>valid</quote> or + <quote>invalid</quote> indexes on table partitions. The invalid index will be ignored for querying purposes because it might be incomplete; however it will still consume update overhead. The <application>psql</application> <command>\d</command> command will report such an index as <literal>INVALID</literal>: @@ -692,15 +695,6 @@ Indexes: cannot. </para> - <para> - Concurrent builds for indexes on partitioned tables are currently not - supported. However, you may concurrently build the index on each - partition individually and then finally create the partitioned index - non-concurrently in order to reduce the time where writes to the - partitioned table will be locked out. In this case, building the - partitioned index is a metadata only operation. - </para> - </refsect2> </refsect1> diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 3ec8b5cca6c..daba8b67dbe 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -93,6 +93,11 @@ static char *ChooseIndexName(const char *tabname, Oid namespaceId, bool primary, bool isconstraint); static char *ChooseIndexNameAddition(List *colnames); static List *ChooseIndexColumnNames(List *indexElems); +static void DefineIndexConcurrentInternal(Oid relationId, + Oid indexRelationId, + IndexInfo *indexInfo, + LOCKTAG heaplocktag, + LockRelId heaprelid); static void ReindexIndex(RangeVar *indexRelation, ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -559,7 +564,6 @@ DefineIndex(Oid relationId, bool amissummarizing; amoptions_function amoptions; bool partitioned; - bool safe_index; Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; @@ -567,12 +571,10 @@ DefineIndex(Oid relationId, bits16 constr_flags; int numberOfAttributes; int numberOfKeyAttributes; - TransactionId limitXmin; ObjectAddress address; LockRelId heaprelid; LOCKTAG heaplocktag; LOCKMODE lockmode; - Snapshot snapshot; Oid root_save_userid; int root_save_sec_context; int root_save_nestlevel; @@ -705,17 +707,6 @@ DefineIndex(Oid relationId, partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE; if (partitioned) { - /* - * Note: we check 'stmt->concurrent' rather than 'concurrent', so that - * the error is thrown also for temporary tables. Seems better to be - * consistent, even though we could do it on temporary table because - * we're not actually doing it concurrently. - */ - if (stmt->concurrent) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create index on partitioned table \"%s\" concurrently", - RelationGetRelationName(rel)))); if (stmt->excludeOpNames) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), @@ -1089,10 +1080,6 @@ DefineIndex(Oid relationId, } } - /* Is index safe for others to ignore? See set_indexsafe_procflags() */ - safe_index = indexInfo->ii_Expressions == NIL && - indexInfo->ii_Predicate == NIL; - /* * Report index creation if appropriate (delay this till after most of the * error checks) @@ -1157,6 +1144,11 @@ DefineIndex(Oid relationId, if (pd->nparts != 0) flags |= INDEX_CREATE_INVALID; } + else if (concurrent && OidIsValid(parentIndexId)) + { + /* If concurrent, initially build index partitions as "invalid" */ + flags |= INDEX_CREATE_INVALID; + } if (stmt->deferrable) constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; @@ -1494,58 +1486,54 @@ DefineIndex(Oid relationId, * invalid, this is incorrect, so update our row to invalid too. */ if (invalidate_parent) - { - Relation pg_index = table_open(IndexRelationId, RowExclusiveLock); - HeapTuple tup, - newtup; - - tup = SearchSysCache1(INDEXRELID, - ObjectIdGetDatum(indexRelationId)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for index %u", - indexRelationId); - newtup = heap_copytuple(tup); - ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false; - CatalogTupleUpdate(pg_index, &tup->t_self, newtup); - ReleaseSysCache(tup); - table_close(pg_index, RowExclusiveLock); - heap_freetuple(newtup); - } + index_set_state_flags(indexRelationId, INDEX_DROP_CLEAR_VALID); } /* * Indexes on partitioned tables are not themselves built, so we're - * done here. + * done here in the non-concurrent case. */ - AtEOXact_GUC(false, root_save_nestlevel); - SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - table_close(rel, NoLock); - if (!OidIsValid(parentIndexId)) - pgstat_progress_end_command(); - else + if (!concurrent) { - /* Update progress for an intermediate partitioned index itself */ - pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); - } + AtEOXact_GUC(false, root_save_nestlevel); + SetUserIdAndSecContext(root_save_userid, root_save_sec_context); + table_close(rel, NoLock); - return address; + if (!OidIsValid(parentIndexId)) + pgstat_progress_end_command(); + else + { + /* + * Update progress for an intermediate partitioned index + * itself + */ + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + return address; + } } AtEOXact_GUC(false, root_save_nestlevel); SetUserIdAndSecContext(root_save_userid, root_save_sec_context); - if (!concurrent) + /* + * All done in the non-concurrent case, and when building catalog entries + * of partitions for CIC. + */ + if (!concurrent || OidIsValid(parentIndexId)) { - /* Close the heap and we're done, in the non-concurrent case */ table_close(rel, NoLock); /* * If this is the top-level index, the command is done overall; - * otherwise, increment progress to report one child index is done. + * otherwise (when being called recursively), increment progress to + * report that one child index is done. Except in the concurrent + * (catalog-only) case, which is handled later. */ if (!OidIsValid(parentIndexId)) pgstat_progress_end_command(); - else + else if (!concurrent) pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); return address; @@ -1556,6 +1544,114 @@ DefineIndex(Oid relationId, SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); table_close(rel, NoLock); + if (!partitioned) + { + /* CREATE INDEX CONCURRENTLY on a nonpartitioned table */ + DefineIndexConcurrentInternal(relationId, indexRelationId, + indexInfo, heaplocktag, heaprelid); + pgstat_progress_end_command(); + return address; + } + else + { + /* + * For CIC on a partitioned table, finish by building indexes on + * partitions + */ + + ListCell *lc; + List *childs; + List *partitioned = NIL; + MemoryContext cic_context, + old_context; + + /* Create special memory context for cross-transaction storage */ + cic_context = AllocSetContextCreate(PortalContext, + "Create index concurrently", + ALLOCSET_DEFAULT_SIZES); + + old_context = MemoryContextSwitchTo(cic_context); + childs = find_all_inheritors(indexRelationId, ShareLock, NULL); + MemoryContextSwitchTo(old_context); + + foreach(lc, childs) + { + Oid indrelid = lfirst_oid(lc); + Oid tabrelid; + char relkind; + + /* + * Pre-existing partitions which were ATTACHED were already + * counted in the progress report. + */ + if (get_index_isvalid(indrelid)) + continue; + + /* + * Partitioned indexes are counted in the progress report, but + * don't need to be further processed. + */ + relkind = get_rel_relkind(indrelid); + if (!RELKIND_HAS_STORAGE(relkind)) + { + /* The toplevel index doesn't count towards "partitions done" */ + if (indrelid != indexRelationId) + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + + /* + * Build up a list of all the intermediate partitioned tables + * which will later need to be set valid. + */ + old_context = MemoryContextSwitchTo(cic_context); + partitioned = lappend_oid(partitioned, indrelid); + MemoryContextSwitchTo(old_context); + continue; + } + + rel = table_open(relationId, ShareUpdateExclusiveLock); + heaprelid = rel->rd_lockInfo.lockRelId; + table_close(rel, ShareUpdateExclusiveLock); + SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); + + /* Process each partition in a separate transaction */ + tabrelid = IndexGetRelation(indrelid, false); + DefineIndexConcurrentInternal(tabrelid, indrelid, indexInfo, + heaplocktag, heaprelid); + + PushActiveSnapshot(GetTransactionSnapshot()); + pgstat_progress_incr_param(PROGRESS_CREATEIDX_PARTITIONS_DONE, 1); + } + + /* Set as valid all partitioned indexes, including the parent */ + foreach(lc, partitioned) + { + Oid indrelid = lfirst_oid(lc); + + index_set_state_flags(indrelid, INDEX_CREATE_SET_READY); + CommandCounterIncrement(); + index_set_state_flags(indrelid, INDEX_CREATE_SET_VALID); + } + + MemoryContextDelete(cic_context); + pgstat_progress_end_command(); + PopActiveSnapshot(); + return address; + } +} + + +static void +DefineIndexConcurrentInternal(Oid relationId, + Oid indexRelationId, IndexInfo *indexInfo, + LOCKTAG heaplocktag, LockRelId heaprelid) +{ + TransactionId limitXmin; + Snapshot snapshot; + + /* Is index safe for others to ignore? See set_indexsafe_procflags() */ + bool safe_index = indexInfo->ii_Expressions == NIL && + indexInfo->ii_Predicate == NIL; + /* * For a concurrent build, it's important to make the catalog entries * visible to other transactions before we start to build the index. That @@ -1751,10 +1847,6 @@ DefineIndex(Oid relationId, * Last thing to do is release the session-level lock on the parent table. */ UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); - - pgstat_progress_end_command(); - - return address; } diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index 1bdd430f063..f1beee6d240 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -50,11 +50,130 @@ select relname, relkind, relhassubclass, inhparent::regclass (8 rows) drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); -ERROR: cannot create index on partitioned table "idxpart" concurrently +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +ERROR: could not create unique index "idxpart2_a_idx1" +DETAIL: Key (a)=(10) is duplicated. +\d idxpart + Partitioned table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition key: RANGE (a) +Indexes: + "idxpart_a_idx" btree (a) + "idxpart_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 3 (Use \d+ to list them.) + +\d idxpart1 + Partitioned table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart11 + Partitioned table "public.idxpart11" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart1 FOR VALUES FROM (0) TO (10) +Partition key: RANGE (a) +Indexes: + "idxpart11_a_idx" btree (a) + "idxpart11_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart111 + Partitioned table "public.idxpart111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart11 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart111_a_idx" btree (a) + "idxpart111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1111 + Partitioned table "public.idxpart1111" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart111 DEFAULT +Partition key: RANGE (a) +Indexes: + "idxpart1111_a_idx" btree (a) + "idxpart1111_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 0 + +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (10) TO (20) +Indexes: + "idxpart2_a_idx" btree (a) + "idxpart2_a_idx1" UNIQUE, btree (a) INVALID + +\d idxpart3 + Partitioned table "public.idxpart3" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (30) TO (40) +Partition key: RANGE (a) +Indexes: + "idxpart3_a_idx" btree (a) + "idxpart3_a_idx1" UNIQUE, btree (a) INVALID +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart31 + Table "public.idxpart31" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart3 DEFAULT +Indexes: + "idxpart31_a_idx" btree (a) + "idxpart31_a_idx1" UNIQUE, btree (a) INVALID + drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- https://postgr.es/m/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 429120e7104..fb0baedcc28 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -29,10 +29,30 @@ select relname, relkind, relhassubclass, inhparent::regclass where relname like 'idxpart%' order by relname; drop table idxpart; --- Some unsupported features +-- CIC on partitioned table create table idxpart (a int, b int, c text) partition by range (a); -create table idxpart1 partition of idxpart for values from (0) to (10); -create index concurrently on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10) partition by range(a); +create table idxpart11 partition of idxpart1 for values from (0) to (10) partition by range(a); +create table idxpart111 partition of idxpart11 default partition by range(a); +create table idxpart1111 partition of idxpart111 default partition by range(a); +create table idxpart2 partition of idxpart for values from (10) to (20); +create table idxpart3 partition of idxpart for values from (30) to (40) partition by range(a); +create table idxpart31 partition of idxpart3 default; + +insert into idxpart2 values(10),(10); -- not unique +create index concurrently on idxpart11 (a); -- partitioned and partition, with no leaves +create index concurrently on idxpart1 (a); -- partitioned and partition +create index concurrently on idxpart2 (a); -- leaf +create index concurrently on idxpart (a); -- partitioned +create unique index concurrently on idxpart (a); -- partitioned, unique failure +\d idxpart +\d idxpart1 +\d idxpart11 +\d idxpart111 +\d idxpart1111 +\d idxpart2 +\d idxpart3 +\d idxpart31 drop table idxpart; -- Verify bugfix with query on indexed partitioned table with no partitions -- 2.34.1