Jimmy Yih <j...@vmware.com> writes:
> Tom Lane <t...@sss.pgh.pa.us> wrote:
>> Actually though, maybe you *don't* want to do this in
>> RangeVarCallbackForDropRelation.  Because of point 2, it might be
>> better to run find_all_inheritors after we've successfully
>> identified and locked the direct target relation, ie do it back
>> in RemoveRelations.  I've not thought hard about that, but it's
>> attractive if only because it'd mean you don't have to fix point 1.

> We think that RangeVarCallbackForDropRelation is probably the most
> correct function to place the fix. It would look a bit out-of-place
> being in RemoveRelations seeing how there's already relative DROP
> INDEX code in RangeVarCallbackForDropRelation.

I really think you made the wrong choice here.  Doing the locking in
RemoveRelations leads to an extremely simple patch, as I demonstrate
below.  Moreover, since RemoveRelations also has special-case code
for partitioned indexes, it's hard to argue that it mustn't cover
this case too.

Also, I think the proposed test case isn't very good, because when
I run it without applying the code patch, it fails to demonstrate
any deadlock.  The query output is different, but not obviously
wrong.

> Fixed in attached patch. Added another local variable
> is_partitioned_index to store the classform value. The main reason we
> need the classform is because the existing relkind and
> expected_relkind local variables would only show RELKIND_INDEX whereas
> we needed exactly RELKIND_PARTITIONED_INDEX.

Yeah.  As I looked at that I realized that it was totally confusing:
at least one previous author thought that "relkind" stored the rel's
actual relkind, which it doesn't as the code stands.  In particular,
in this bit:

    if ((relkind == RELKIND_INDEX || relkind == RELKIND_PARTITIONED_INDEX) &&
        relOid != oldRelOid)
    {
        state->heapOid = IndexGetRelation(relOid, true);

the test for relkind == RELKIND_PARTITIONED_INDEX is dead code
because relkind will never be that.  It accidentally works anyway
because the other half of the || does the right thing, but somebody
was confused, and so will readers be.

Hence, I propose the attached.  0001 is pure refactoring: it hopefully
clears up the confusion about which "relkind" is which, and it also
saves a couple of redundant syscache fetches in RemoveRelations.
Then 0002 adds the actual bug fix as well as a test case that does
deadlock with unpatched code.

                        regards, tom lane

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index dc5872f988..26ccd7f481 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -295,12 +295,18 @@ static const struct dropmsgstrings dropmsgstringarray[] = {
 	{'\0', 0, NULL, NULL, NULL, NULL}
 };
 
+/* communication between RemoveRelations and RangeVarCallbackForDropRelation */
 struct DropRelationCallbackState
 {
-	char		relkind;
+	/* These fields are set by RemoveRelations: */
+	char		expected_relkind;
+	bool		concurrent;
+	/* These fields are state to track which subsidiary locks are held: */
 	Oid			heapOid;
 	Oid			partParentOid;
-	bool		concurrent;
+	/* These fields are passed back by RangeVarCallbackForDropRelation: */
+	char		actual_relkind;
+	char		actual_relpersistence;
 };
 
 /* Alter table target-type flags for ATSimplePermissions */
@@ -1416,10 +1422,12 @@ RemoveRelations(DropStmt *drop)
 		AcceptInvalidationMessages();
 
 		/* Look up the appropriate relation using namespace search. */
-		state.relkind = relkind;
+		state.expected_relkind = relkind;
+		state.concurrent = drop->concurrent;
+		/* We must initialize these fields to show that no locks are held: */
 		state.heapOid = InvalidOid;
 		state.partParentOid = InvalidOid;
-		state.concurrent = drop->concurrent;
+
 		relOid = RangeVarGetRelidExtended(rel, lockmode, RVR_MISSING_OK,
 										  RangeVarCallbackForDropRelation,
 										  (void *) &state);
@@ -1433,10 +1441,10 @@ RemoveRelations(DropStmt *drop)
 
 		/*
 		 * Decide if concurrent mode needs to be used here or not.  The
-		 * relation persistence cannot be known without its OID.
+		 * callback retrieved the rel's persistence for us.
 		 */
 		if (drop->concurrent &&
-			get_rel_persistence(relOid) != RELPERSISTENCE_TEMP)
+			state.actual_relpersistence != RELPERSISTENCE_TEMP)
 		{
 			Assert(list_length(drop->objects) == 1 &&
 				   drop->removeType == OBJECT_INDEX);
@@ -1448,7 +1456,7 @@ RemoveRelations(DropStmt *drop)
 		 * either.
 		 */
 		if ((flags & PERFORM_DELETION_CONCURRENTLY) != 0 &&
-			get_rel_relkind(relOid) == RELKIND_PARTITIONED_INDEX)
+			state.actual_relkind == RELKIND_PARTITIONED_INDEX)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("cannot drop partitioned index \"%s\" concurrently",
@@ -1479,7 +1487,6 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 {
 	HeapTuple	tuple;
 	struct DropRelationCallbackState *state;
-	char		relkind;
 	char		expected_relkind;
 	bool		is_partition;
 	Form_pg_class classform;
@@ -1487,7 +1494,6 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 	bool		invalid_system_index = false;
 
 	state = (struct DropRelationCallbackState *) arg;
-	relkind = state->relkind;
 	heap_lockmode = state->concurrent ?
 		ShareUpdateExclusiveLock : AccessExclusiveLock;
 
@@ -1523,6 +1529,10 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 	classform = (Form_pg_class) GETSTRUCT(tuple);
 	is_partition = classform->relispartition;
 
+	/* Pass back some data to save lookups in RemoveRelations */
+	state->actual_relkind = classform->relkind;
+	state->actual_relpersistence = classform->relpersistence;
+
 	/*
 	 * Both RELKIND_RELATION and RELKIND_PARTITIONED_TABLE are OBJECT_TABLE,
 	 * but RemoveRelations() can only pass one relkind for a given relation.
@@ -1538,13 +1548,15 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 	else
 		expected_relkind = classform->relkind;
 
-	if (relkind != expected_relkind)
-		DropErrorMsgWrongType(rel->relname, classform->relkind, relkind);
+	if (state->expected_relkind != expected_relkind)
+		DropErrorMsgWrongType(rel->relname, classform->relkind,
+							  state->expected_relkind);
 
 	/* Allow DROP to either table owner or schema owner */
 	if (!pg_class_ownercheck(relOid, GetUserId()) &&
 		!pg_namespace_ownercheck(classform->relnamespace, GetUserId()))
-		aclcheck_error(ACLCHECK_NOT_OWNER, get_relkind_objtype(get_rel_relkind(relOid)),
+		aclcheck_error(ACLCHECK_NOT_OWNER,
+					   get_relkind_objtype(classform->relkind),
 					   rel->relname);
 
 	/*
@@ -1553,7 +1565,7 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 	 * only concerns indexes of toast relations that became invalid during a
 	 * REINDEX CONCURRENTLY process.
 	 */
-	if (IsSystemClass(relOid, classform) && relkind == RELKIND_INDEX)
+	if (IsSystemClass(relOid, classform) && classform->relkind == RELKIND_INDEX)
 	{
 		HeapTuple	locTuple;
 		Form_pg_index indexform;
@@ -1589,9 +1601,10 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
 	 * locking the index.  index_drop() will need this anyway, and since
 	 * regular queries lock tables before their indexes, we risk deadlock if
 	 * we do it the other way around.  No error if we don't find a pg_index
-	 * entry, though --- the relation may have been dropped.
+	 * entry, though --- the relation may have been dropped.  Note that this
+	 * code will execute for either plain or partitioned indexes.
 	 */
-	if ((relkind == RELKIND_INDEX || relkind == RELKIND_PARTITIONED_INDEX) &&
+	if (expected_relkind == RELKIND_INDEX &&
 		relOid != oldRelOid)
 	{
 		state->heapOid = IndexGetRelation(relOid, true);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 26ccd7f481..d8faf41c94 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -1462,6 +1462,18 @@ RemoveRelations(DropStmt *drop)
 					 errmsg("cannot drop partitioned index \"%s\" concurrently",
 							rel->relname)));
 
+		/*
+		 * If we're told to drop a partitioned index, we must acquire lock on
+		 * all the children of its parent partitioned table before proceeding.
+		 * Otherwise we'd try to lock the child index partitions before their
+		 * tables, leading to potential deadlock against other sessions that
+		 * will lock those objects in the other order.
+		 */
+		if (state.actual_relkind == RELKIND_PARTITIONED_INDEX)
+			(void) find_all_inheritors(state.heapOid,
+									   AccessExclusiveLock,
+									   NULL);
+
 		/* OK, we're ready to delete this one */
 		obj.classId = RelationRelationId;
 		obj.objectId = relOid;
diff --git a/src/test/isolation/expected/partition-drop-index-locking.out b/src/test/isolation/expected/partition-drop-index-locking.out
new file mode 100644
index 0000000000..9acd51dfde
--- /dev/null
+++ b/src/test/isolation/expected/partition-drop-index-locking.out
@@ -0,0 +1,100 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1begin s1lock s2begin s2drop s1select s3getlocks s1commit s3getlocks s2commit
+step s1begin: BEGIN;
+step s1lock: LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE;
+step s2begin: BEGIN;
+step s2drop: DROP INDEX part_drop_index_locking_idx; <waiting ...>
+step s1select: SELECT * FROM part_drop_index_locking_subpart_child;
+id
+--
+(0 rows)
+
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                               |relname                                      |mode               |granted
+----------------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking                      |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_idx                  |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;             |part_drop_index_locking_subpart_child        |AccessExclusiveLock|f      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child        |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx1|AccessShareLock    |t      
+(7 rows)
+
+step s1commit: COMMIT;
+step s2drop: <... completed>
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                  |relname                                     |mode               |granted
+---------------------------------------+--------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking                     |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_idx                 |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart             |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_child       |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_child_id_idx|AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_idx;|part_drop_index_locking_subpart_id_idx      |AccessExclusiveLock|t      
+(6 rows)
+
+step s2commit: COMMIT;
+
+starting permutation: s1begin s1lock s2begin s2dropsub s1select s3getlocks s1commit s3getlocks s2commit
+step s1begin: BEGIN;
+step s1lock: LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE;
+step s2begin: BEGIN;
+step s2dropsub: DROP INDEX part_drop_index_locking_subpart_idx; <waiting ...>
+step s1select: SELECT * FROM part_drop_index_locking_subpart_child;
+id
+--
+(0 rows)
+
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                               |relname                                      |mode               |granted
+----------------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart_child        |AccessExclusiveLock|f      
+DROP INDEX part_drop_index_locking_subpart_idx;     |part_drop_index_locking_subpart_idx          |AccessExclusiveLock|t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child        |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx |AccessShareLock    |t      
+SELECT * FROM part_drop_index_locking_subpart_child;|part_drop_index_locking_subpart_child_id_idx1|AccessShareLock    |t      
+(6 rows)
+
+step s1commit: COMMIT;
+step s2dropsub: <... completed>
+step s3getlocks: 
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+
+query                                          |relname                                      |mode               |granted
+-----------------------------------------------+---------------------------------------------+-------------------+-------
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart              |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_child        |AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_child_id_idx1|AccessExclusiveLock|t      
+DROP INDEX part_drop_index_locking_subpart_idx;|part_drop_index_locking_subpart_idx          |AccessExclusiveLock|t      
+(4 rows)
+
+step s2commit: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 0dae483e82..8e87098150 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -90,6 +90,7 @@ test: predicate-hash
 test: predicate-gist
 test: predicate-gin
 test: partition-concurrent-attach
+test: partition-drop-index-locking
 test: partition-key-update-1
 test: partition-key-update-2
 test: partition-key-update-3
diff --git a/src/test/isolation/specs/partition-drop-index-locking.spec b/src/test/isolation/specs/partition-drop-index-locking.spec
new file mode 100644
index 0000000000..99e7f6cb64
--- /dev/null
+++ b/src/test/isolation/specs/partition-drop-index-locking.spec
@@ -0,0 +1,44 @@
+# Verify that DROP INDEX properly locks all downward sub-partitions
+# and partitions before locking the indexes.
+
+setup
+{
+  CREATE TABLE part_drop_index_locking (id int) PARTITION BY RANGE(id);
+  CREATE TABLE part_drop_index_locking_subpart PARTITION OF part_drop_index_locking FOR VALUES FROM (1) TO (100) PARTITION BY RANGE(id);
+  CREATE TABLE part_drop_index_locking_subpart_child PARTITION OF part_drop_index_locking_subpart FOR VALUES FROM (1) TO (100);
+  CREATE INDEX part_drop_index_locking_idx ON part_drop_index_locking(id);
+  CREATE INDEX part_drop_index_locking_subpart_idx ON part_drop_index_locking_subpart(id);
+}
+
+teardown
+{
+  DROP TABLE part_drop_index_locking;
+}
+
+session s1
+step s1begin    { BEGIN; }
+step s1lock     { LOCK TABLE part_drop_index_locking_subpart_child IN ACCESS SHARE MODE; }
+step s1select   { SELECT * FROM part_drop_index_locking_subpart_child; }
+step s1commit   { COMMIT; }
+
+session s2
+step s2begin    { BEGIN; }
+step s2drop     { DROP INDEX part_drop_index_locking_idx; }
+step s2dropsub  { DROP INDEX part_drop_index_locking_subpart_idx; }
+step s2commit   { COMMIT; }
+
+session s3
+step s3getlocks {
+        SELECT s.query, c.relname, l.mode, l.granted
+        FROM pg_locks l
+                JOIN pg_class c ON l.relation = c.oid
+                JOIN pg_stat_activity s ON l.pid = s.pid
+        WHERE c.relname LIKE 'part_drop_index_locking%'
+        ORDER BY s.query, c.relname, l.mode, l.granted;
+}
+
+# Run DROP INDEX on top partitioned table
+permutation s1begin s1lock s2begin s2drop(s1commit) s1select s3getlocks s1commit s3getlocks s2commit
+
+# Run DROP INDEX on top sub-partition table
+permutation s1begin s1lock s2begin s2dropsub(s1commit) s1select s3getlocks s1commit s3getlocks s2commit

Reply via email to