commit 4ebb1abad475c16877f4390c3c7ba082f3945c0e
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Sat Dec 8 04:30:54 2018 +0300

    Fix deadlock in GIN vacuum introduced by 218f51584d5
    
    Before 218f51584d5 when posting tree page is about to be deleted, then the whole
    posting tree is locked by LockBufferForCleanup() on root preventing all
    concurrent inserts.  218f51584d5 changed reduces locking to subtree containing
    page to be deleted.  However, due to concurrent parent split, inserter doesn't
    always holds pins on pages constituting path from root to the leaf page.  That
    could cause a deadlock between GIN vacuum process and GIN inserter.
    
    This commit reverts VACUUM behavior to lock the whole posting tree before
    delete any page.  However, we keep another change by 218f51584d5: the tree is
    locked only if there are pages to delete.

diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README
index cc434b1feb7..421b5b26d5b 100644
--- a/src/backend/access/gin/README
+++ b/src/backend/access/gin/README
@@ -314,17 +314,10 @@ deleted.
 The previous paragraph's reasoning only applies to searches, and only to
 posting trees. To protect from inserters following a downlink to a deleted
 page, vacuum simply locks out all concurrent insertions to the posting tree,
-by holding a super-exclusive lock on the parent page of subtree with deletable
-pages. Inserters hold a pin on the root page, but searches do not, so while
-new searches cannot begin while root page is locked, any already-in-progress
-scans can continue concurrently with vacuum in corresponding subtree of
-posting tree. To exclude interference with readers vacuum takes exclusive
-locks in a depth-first scan in left-to-right order of page tuples. Leftmost
-page is never deleted. Thus before deleting any page we obtain exclusive
-lock on any left page, effectively excluding deadlock with any reader, despite
-taking parent lock before current and left lock after current. We take left
-lock not for a concurrency reasons, but rather in need to mark page dirty.
-In the entry tree, we never delete pages.
+by holding a super-exclusive lock on the posting tree root. Inserters hold a
+pin on the root page, but searches do not, so while new searches cannot begin
+while root page is locked, any already-in-progress scans can continue
+concurrently with vacuum. In the entry tree, we never delete pages.
 
 (This is quite different from the mechanism the btree indexam uses to make
 page-deletions safe; it stamps the deleted pages with an XID and keeps the
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 3104bc12b63..bdeb0bf4f52 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -315,124 +315,114 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
 
 
 /*
- * Scan through posting tree, delete empty tuples from leaf pages.
- * Also, this function collects empty subtrees (with all empty leafs).
- * For parents of these subtrees CleanUp lock is taken, then we call
- * ScanToDelete. This is done for every inner page, which points to
- * empty subtree.
+ * Scan through posting tree leafs, delete empty tuples.  Returns true if there
+ * is at least one empty page.
  */
 static bool
-ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno)
 {
 	Buffer		buffer;
 	Page		page;
 	bool		hasVoidPage = false;
 	MemoryContext oldCxt;
 
-	buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
-								RBM_NORMAL, gvs->strategy);
-	page = BufferGetPage(buffer);
+	/* Find leftmost leaf page of posting tree and lock it in exclusive mode */
+	while (true)
+	{
+		PostingItem *pitem;
 
-	ginTraverseLock(buffer, false);
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, gvs->strategy);
+		LockBuffer(buffer, GIN_SHARE);
+		page = BufferGetPage(buffer);
 
-	Assert(GinPageIsData(page));
+		Assert(GinPageIsData(page));
 
-	if (GinPageIsLeaf(page))
+		if (GinPageIsLeaf(page))
+		{
+			LockBuffer(buffer, GIN_UNLOCK);
+			LockBuffer(buffer, GIN_EXCLUSIVE);
+			break;
+		}
+
+		Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
+
+		pitem = GinDataPageGetPostingItem(page, FirstOffsetNumber);
+		blkno = PostingItemGetBlockNumber(pitem);
+		Assert(blkno != InvalidBlockNumber);
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* Iterate all posting tree leaves using rightlinks and vacuum them */
+	while (true)
 	{
 		oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
 		ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
 		MemoryContextSwitchTo(oldCxt);
 		MemoryContextReset(gvs->tmpCxt);
 
-		/* if root is a leaf page, we don't desire further processing */
 		if (GinDataLeafPageIsEmpty(page))
 			hasVoidPage = true;
 
-		UnlockReleaseBuffer(buffer);
-
-		return hasVoidPage;
-	}
-	else
-	{
-		OffsetNumber i;
-		bool		hasEmptyChild = false;
-		bool		hasNonEmptyChild = false;
-		OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
-		BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
-
-		/*
-		 * Read all children BlockNumbers. Not sure it is safe if there are
-		 * many concurrent vacuums.
-		 */
-
-		for (i = FirstOffsetNumber; i <= maxoff; i++)
-		{
-			PostingItem *pitem = GinDataPageGetPostingItem(page, i);
-
-			children[i] = PostingItemGetBlockNumber(pitem);
-		}
+		blkno = GinPageGetOpaque(page)->rightlink;
 
 		UnlockReleaseBuffer(buffer);
 
-		for (i = FirstOffsetNumber; i <= maxoff; i++)
-		{
-			if (ginVacuumPostingTreeLeaves(gvs, children[i], false))
-				hasEmptyChild = true;
-			else
-				hasNonEmptyChild = true;
-		}
+		if (blkno == InvalidBlockNumber)
+			break;
 
-		pfree(children);
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, gvs->strategy);
+		LockBuffer(buffer, GIN_EXCLUSIVE);
+		page = BufferGetPage(buffer);
+	}
 
-		vacuum_delay_point();
+	return hasVoidPage;
+}
 
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+	if (ginVacuumPostingTreeLeaves(gvs, rootBlkno))
+	{
 		/*
-		 * All subtree is empty - just return true to indicate that parent
-		 * must do a cleanup, unless we are ROOT and there is way to go upper.
+		 * There is at least one empty page.  So we have to rescan the tree
+		 * deleting empty pages.
 		 */
+		Buffer				buffer;
+		DataPageDeleteStack root,
+						   *ptr,
+						   *tmp;
 
-		if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
-			return true;
+		buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, rootBlkno,
+									RBM_NORMAL, gvs->strategy);
 
-		if (hasEmptyChild)
-		{
-			DataPageDeleteStack root,
-					   *ptr,
-					   *tmp;
-
-			buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
-										RBM_NORMAL, gvs->strategy);
-			LockBufferForCleanup(buffer);
-
-			memset(&root, 0, sizeof(DataPageDeleteStack));
-			root.leftBlkno = InvalidBlockNumber;
-			root.isRoot = true;
+		/*
+		 * Lock posting tree root for cleanup to ensure there are no concurrent
+		 * inserts.
+		 */
+		LockBufferForCleanup(buffer);
 
-			ginScanToDelete(gvs, blkno, true, &root, InvalidOffsetNumber);
+		memset(&root, 0, sizeof(DataPageDeleteStack));
+		root.leftBlkno = InvalidBlockNumber;
+		root.isRoot = true;
 
-			ptr = root.child;
+		ginScanToDelete(gvs, rootBlkno, true, &root, InvalidOffsetNumber);
 
-			while (ptr)
-			{
-				tmp = ptr->child;
-				pfree(ptr);
-				ptr = tmp;
-			}
+		ptr = root.child;
 
-			UnlockReleaseBuffer(buffer);
+		while (ptr)
+		{
+			tmp = ptr->child;
+			pfree(ptr);
+			ptr = tmp;
 		}
 
-		/* Here we have deleted all empty subtrees */
-		return false;
+		UnlockReleaseBuffer(buffer);
 	}
 }
 
-static void
-ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
-{
-	ginVacuumPostingTreeLeaves(gvs, rootBlkno, true);
-}
-
 /*
  * returns modified page or NULL if page isn't modified.
  * Function works with original page until first change is occurred,
