From c520ab04cbbe705c9a2d55c0575b2a6b84b7ced5 Mon Sep 17 00:00:00 2001
From: Kirk Jamison <k.jamison@jp.fujitsu.com>
Date: Fri, 11 Sep 2020 13:00:33 +0000
Subject: [PATCH] Optimize DropRelFileNodeBuffers() during recovery.

The recovery path of DropRelFileNodeBuffers() is optimized so that
scanning of the whole buffer pool is avoided when the relation
is small enough, or the total number of blocks to be invalidated
is below the threshold of full scanning.

During recovery, we can get a reliable cached value of nblocks for the
supplied relation's fork by calling smgrcachednblocks(), and it's
safe because there are no other processes but the startup process
that changes the relation size during recovery.  Otherwise, or if
not in recovery, proceed to sequential search of the whole buffer pool.
---
 src/backend/storage/buffer/bufmgr.c | 145 +++++++++++++++++++++++++++---------
 src/backend/storage/smgr/smgr.c     |  25 ++++++-
 src/include/storage/bufmgr.h        |   2 +-
 src/include/storage/smgr.h          |   1 +
 4 files changed, 137 insertions(+), 36 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1..d44686e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -70,6 +70,8 @@
 
 #define RELS_BSEARCH_THRESHOLD		20
 
+#define BUF_DROP_FULLSCAN_THRESHOLD		((uint32) (NBuffers / 500))
+
 typedef struct PrivateRefCountEntry
 {
 	Buffer		buffer;
@@ -2965,18 +2967,28 @@ BufferGetLSNAtomic(Buffer buffer)
  *		that no other process could be trying to load more pages of the
  *		relation into buffers.
  *
- *		XXX currently it sequentially searches the buffer pool, should be
- *		changed to more clever ways of searching.  However, this routine
- *		is used only in code paths that aren't very performance-critical,
- *		and we shouldn't slow down the hot paths to make it faster ...
+ *		XXX The relation might have extended before this, so this path is
+ *		only optimized during recovery when we can get a reliable cached
+ *		value of blocks for specified relation.  See comment in
+ *		smgrnblocks() in smgr.c.  In addition, it is safe to do this since
+ *		there are no other processes but the startup process that changes
+ *		the relation size during recovery.  Otherwise, or if not in
+ *		recovery, proceed to usual invalidation process, where it
+ *		sequentially searches the buffer pool.
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 					   int nforks, BlockNumber *firstDelBlock)
 {
 	int			i;
 	int			j;
+	RelFileNodeBackend	rnode;
+	BlockNumber	nTotalBlocks = 0;	/* accumulated below; must start at 0 */
+	BufferDesc	*bufHdr;
+	uint32		buf_state;
+
+	rnode = smgr_reln->smgr_rnode;
 
 	/* If it's a local relation, it's localbuf.c's problem. */
 	if (RelFileNodeBackendIsTemp(rnode))
@@ -2990,44 +3002,109 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 		return;
 	}
 
-	for (i = 0; i < NBuffers; i++)
+	/* get the total number of blocks for the supplied relation */
+	for (j = 0; j < nforks; j++)
 	{
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		BlockNumber	nForkBlocks; /* nblocks for a relation's fork */
 
-		/*
-		 * We can make this a tad faster by prechecking the buffer tag before
-		 * we attempt to lock the buffer; this saves a lot of lock
-		 * acquisitions in typical cases.  It should be safe because the
-		 * caller must have AccessExclusiveLock on the relation, or some other
-		 * reason to be certain that no one is loading new pages of the rel
-		 * into the buffer pool.  (Otherwise we might well miss such pages
-		 * entirely.)  Therefore, while the tag might be changing while we
-		 * look at it, it can't be changing *to* a value we care about, only
-		 * *away* from such a value.  So false negatives are impossible, and
-		 * false positives are safe because we'll recheck after getting the
-		 * buffer lock.
-		 *
-		 * We could check forkNum and blockNum as well as the rnode, but the
-		 * incremental win from doing so seems small.
-		 */
-		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
-			continue;
+		/* this returns an InvalidBlockNumber when not in recovery */
+		nForkBlocks = smgrcachednblocks(smgr_reln, forkNum[j]);
 
-		buf_state = LockBufHdr(bufHdr);
+		if (nForkBlocks == InvalidBlockNumber)
+		{
+			nTotalBlocks = InvalidBlockNumber;
+			break;
+		}
+		nTotalBlocks += nForkBlocks;
+	}
 
+	/*
+	 * Do explicit hashtable probe if the blocks to be dropped have not been
+	 * invalidated yet and the value is less than the full-scan threshold of
+	 * buffer pool.  IOW, relation is small enough for its buffers to be removed.
+	 */
+	if (nTotalBlocks != InvalidBlockNumber &&
+		nTotalBlocks < BUF_DROP_FULLSCAN_THRESHOLD)
+	{
 		for (j = 0; j < nforks; j++)
 		{
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
-				bufHdr->tag.forkNum == forkNum[j] &&
-				bufHdr->tag.blockNum >= firstDelBlock[j])
+			BlockNumber		curBlock;
+			BlockNumber		nForkBlocks = smgrcachednblocks(smgr_reln, forkNum[j]);
+			for (curBlock = firstDelBlock[j]; curBlock < nForkBlocks; curBlock++)
 			{
-				InvalidateBuffer(bufHdr);	/* releases spinlock */
-				break;
+				uint32		newHash;		/* hash value for newTag */
+				BufferTag	newTag;			/* identity of requested block */
+				LWLock	   	*newPartitionLock;	/* buffer partition lock for it */
+				int		buf_id;
+
+				/* create a tag so we can lookup the buffer */
+				INIT_BUFFERTAG(newTag, rnode.node, forkNum[j], curBlock);
+
+				/* determine its hash code and partition lock ID */
+				newHash = BufTableHashCode(&newTag);
+				newPartitionLock = BufMappingPartitionLock(newHash);
+
+				/* Check that it is in the buffer pool. If not, do nothing. */
+				LWLockAcquire(newPartitionLock, LW_SHARED);
+				buf_id = BufTableLookup(&newTag, newHash);
+				LWLockRelease(newPartitionLock);
+
+				if (buf_id < 0)
+					continue;
+
+				bufHdr = GetBufferDescriptor(buf_id);
+
+				buf_state = LockBufHdr(bufHdr);
+
+				if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+					bufHdr->tag.forkNum == forkNum[j] &&
+					bufHdr->tag.blockNum == curBlock)
+					InvalidateBuffer(bufHdr);	/* releases spinlock */
+				else
+					UnlockBufHdr(bufHdr, buf_state);
 			}
 		}
-		if (j >= nforks)
-			UnlockBufHdr(bufHdr, buf_state);
+	}
+	else
+	{
+		for (i = 0; i < NBuffers; i++)
+		{
+			bufHdr = GetBufferDescriptor(i);
+
+			/*
+			 * We can make this a tad faster by prechecking the buffer tag before
+			 * we attempt to lock the buffer; this saves a lot of lock
+			 * acquisitions in typical cases.  It should be safe because the
+			 * caller must have AccessExclusiveLock on the relation, or some other
+			 * reason to be certain that no one is loading new pages of the rel
+			 * into the buffer pool.  (Otherwise we might well miss such pages
+			 * entirely.)  Therefore, while the tag might be changing while we
+			 * look at it, it can't be changing *to* a value we care about, only
+			 * *away* from such a value.  So false negatives are impossible, and
+			 * false positives are safe because we'll recheck after getting the
+			 * buffer lock.
+			 *
+			 * We could check forkNum and blockNum as well as the rnode, but the
+			 * incremental win from doing so seems small.
+			 */
+			if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+				continue;
+
+			buf_state = LockBufHdr(bufHdr);
+
+			for (j = 0; j < nforks; j++)
+			{
+				if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+					bufHdr->tag.forkNum == forkNum[j] &&
+					bufHdr->tag.blockNum >= firstDelBlock[j])
+				{
+					InvalidateBuffer(bufHdr);	/* releases spinlock */
+					break;
+				}
+			}
+			if (j >= nforks)
+				UnlockBufHdr(bufHdr, buf_state);
+		}
 	}
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index dcc09df..4e45c47 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -565,6 +565,29 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
+ *	smgrcachednblocks() -- Calculate the number of blocks that are cached in
+ *					 the supplied relation.
+ *
+ * It is equivalent to calling smgrnblocks, but only used in recovery for now
+ * when DropRelFileNodeBuffers() is called.  This ensures that only cached value
+ * is used which is always valid in recovery, since there is no shared
+ * invalidation mechanism that is implemented yet for changes in file size.
+ *
+ * This returns an InvalidBlockNumber when smgr_cached_nblocks is not available
+ * and when the path is not in recovery.
+ */
+BlockNumber
+smgrcachednblocks(SMgrRelation reln, ForkNumber forknum)
+{
+	if (InRecovery)
+	{
+		if (reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+			return reln->smgr_cached_nblocks[forknum];
+	}
+	return InvalidBlockNumber;
+}
+
+/*
  *	smgrtruncate() -- Truncate the given forks of supplied relation to
  *					  each specified numbers of blocks
  *
@@ -583,7 +606,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
 	 * just drop them without bothering to write the contents.
 	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+	DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
 
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8f..056f65e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -203,7 +203,7 @@ extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
 extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f28a842..fbb7a4a 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -99,6 +99,7 @@ extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber smgrcachednblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 						 int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-- 
1.8.3.1

