I (re)discovered why I used the lock-then-pin approach.  In the
comments I mentioned InvalidateBuffer(), but the main problem is in its
caller GetVictimBuffer(), which has various sanity checks on reference
counts that can occasionally fail if other code randomly pins any old
buffer.

New idea: use the standard PinBuffer() function, but add a mode that
doesn't pin invalid buffers (with the caveat that you can perhaps get a
false negative due to the unlocked read, but never a false positive; see
the commit message).  Otherwise we'd have to duplicate all the same
compare-exchange logic for ReadRecentBuffer(), or rethink the
assumptions in that other code.

As for the lack of a usage count bump in the back-branches, I think the
options are: teach PinBuffer_Locked() to increment it optionally, or
back-patch whatever we come up with for this.
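
Very roughly, the first option might amount to something like this in
the middle of PinBuffer_Locked() (just a sketch; the adjust_usagecount
parameter is invented here for illustration and is not part of the
attached patches):

	buf_state = pg_atomic_read_u32(&buf->state);
	Assert(buf_state & BM_LOCKED);
	buf_state += BUF_REFCOUNT_ONE;
	if (adjust_usagecount &&
		BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
		buf_state += BUF_USAGECOUNT_ONE;
	UnlockBufHdr(buf, buf_state);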

For the root buffer optimisation, the obvious place for storage seems
to be under rd_amcache.  That was originally invented for the cached
metapage (commit d2896a9ed14), but it could accommodate a new struct
holding whatever we want.  Here is a patch to try that out.
BTAMCacheData would also be a natural place to put future things,
including a copy of the root page itself, in later work on lock-free
tricks.
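
Concretely, the cache struct from the 0002 patch looks like this; the
trailing comment only marks where a future root page copy might go and
is not part of the patch:

	typedef struct BTAMCacheData
	{
		BTMetaPageData meta_page;		/* copy of the metapage contents */
		bool		meta_page_is_valid;	/* is meta_page usable? */
		Buffer		recent_root_buffer;	/* where the root was last seen */
		/* later, perhaps: a copy of the root page itself */
	} BTAMCacheData;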

Experimental patches attached.
From b91d996c691077a7fa48718c933159136d650e77 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 29 Jun 2023 10:52:56 +1200
Subject: [PATCH 1/2] Improve ReadRecentBuffer() scalability.

While testing a new potential use for ReadRecentBuffer(), Andres
reported that it scales badly when called concurrently for the same
buffer by many backends.  Instead of a naive (but wrong) coding with
PinBuffer(), it used the buffer header spinlock, so that it could be
careful to pin only if the buffer was valid and held the expected
block, to avoid breaking invariants in e.g. GetVictimBuffer().
Unfortunately that made it less scalable than PinBuffer(), which uses
compare-exchange instead.

We can fix that by giving PinBuffer() a new skip_if_not_valid mode that
doesn't pin invalid buffers.  It might occasionally skip when it
shouldn't, due to the unlocked read of the header flags, but that's
unlikely and perfectly acceptable for an opportunistic optimisation
routine, and it can only succeed when it really should, thanks to the
compare-exchange loop.

XXX This also fixes ReadRecentBuffer()'s failure to bump the usage
count.  Fix separately or back-patch this?

Reported-by: Andres Freund <and...@anarazel.de>
Discussion: https://postgr.es/m/20230627020546.t6z4tntmj7wmjrfh%40awork3.anarazel.de
---
 src/backend/storage/buffer/bufmgr.c | 60 ++++++++++++-----------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3c59bbd04e..0df9a1ec30 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -467,7 +467,8 @@ static BlockNumber ExtendBufferedRelShared(ExtendBufferedWhat eb,
 										   BlockNumber extend_upto,
 										   Buffer *buffers,
 										   uint32 *extended_by);
-static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
+static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
+					  bool skip_if_not_valid);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
 static void BufferSync(int flags);
@@ -635,7 +636,6 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 	BufferDesc *bufHdr;
 	BufferTag	tag;
 	uint32		buf_state;
-	bool		have_private_ref;
 
 	Assert(BufferIsValid(recent_buffer));
 
@@ -663,38 +663,17 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 	else
 	{
 		bufHdr = GetBufferDescriptor(recent_buffer - 1);
-		have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
 
-		/*
-		 * Do we already have this buffer pinned with a private reference?  If
-		 * so, it must be valid and it is safe to check the tag without
-		 * locking.  If not, we have to lock the header first and then check.
-		 */
-		if (have_private_ref)
-			buf_state = pg_atomic_read_u32(&bufHdr->state);
-		else
-			buf_state = LockBufHdr(bufHdr);
-
-		if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
+		/* Is it still valid and holding the right tag? */
+		if (PinBuffer(bufHdr, NULL, true))
 		{
-			/*
-			 * It's now safe to pin the buffer.  We can't pin first and ask
-			 * questions later, because it might confuse code paths like
-			 * InvalidateBuffer() if we pinned a random non-matching buffer.
-			 */
-			if (have_private_ref)
-				PinBuffer(bufHdr, NULL);	/* bump pin count */
-			else
-				PinBuffer_Locked(bufHdr);	/* pin for first time */
-
-			pgBufferUsage.shared_blks_hit++;
-
-			return true;
+			if (BufferTagsEqual(&tag, &bufHdr->tag))
+			{
+				pgBufferUsage.shared_blks_hit++;
+				return true;
+			}
+			UnpinBuffer(bufHdr);
 		}
-
-		/* If we locked the header above, now unlock. */
-		if (!have_private_ref)
-			UnlockBufHdr(bufHdr, buf_state);
 	}
 
 	return false;
@@ -1252,7 +1231,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		buf = GetBufferDescriptor(existing_buf_id);
 
-		valid = PinBuffer(buf, strategy);
+		valid = PinBuffer(buf, strategy, false);
 
 		/* Can release the mapping lock as soon as we've pinned it */
 		LWLockRelease(newPartitionLock);
@@ -1330,7 +1309,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 		existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
 
-		valid = PinBuffer(existing_buf_hdr, strategy);
+		valid = PinBuffer(existing_buf_hdr, strategy, false);
 
 		/* Can release the mapping lock as soon as we've pinned it */
 		LWLockRelease(newPartitionLock);
@@ -1979,7 +1958,7 @@ ExtendBufferedRelShared(ExtendBufferedWhat eb,
 			 * Pin the existing buffer before releasing the partition lock,
 			 * preventing it from being evicted.
 			 */
-			valid = PinBuffer(existing_hdr, strategy);
+			valid = PinBuffer(existing_hdr, strategy, false);
 
 			LWLockRelease(partition_lock);
 
@@ -2225,10 +2204,13 @@ ReleaseAndReadBuffer(Buffer buffer,
  * Note that ResourceOwnerEnlargeBuffers must have been done already.
  *
  * Returns true if buffer is BM_VALID, else false.  This provision allows
- * some callers to avoid an extra spinlock cycle.
+ * some callers to avoid an extra spinlock cycle.  If skip_if_not_valid is
+ * true, then a false return value also indicates that the buffer was
+ * (recently) invalid and has not been pinned.
  */
 static bool
-PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
+PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
+		  bool skip_if_not_valid)
 {
 	Buffer		b = BufferDescriptorGetBuffer(buf);
 	bool		result;
@@ -2252,6 +2234,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 			if (old_buf_state & BM_LOCKED)
 				old_buf_state = WaitBufHdrUnlocked(buf);
 
+			if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
+			{
+				ForgetPrivateRefCountEntry(ref);
+				return false;
+			}
+
 			buf_state = old_buf_state;
 
 			/* increase refcount */
-- 
2.40.1

From fdaa0873a37545cf42a90b9ede562bcbc2d72947 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 29 Jun 2023 12:27:52 +1200
Subject: [PATCH 2/2] Use ReadRecentBuffer() for btree root page.

The root page of a btree is accessed on every index scan, so it gets
very hot.  We can measure a speed-up on many workloads by pinning it
with ReadRecentBuffer() instead of ReadBuffer(), after remembering where
it was last time in the AM-private cache space in rel->rd_amcache.

Rearrange the existing use of rd_amcache into a new struct
BTAMCacheData.  It's likely that we'll find more things to put in there
in future work.

Discussion: https://postgr.es/m/20230627020546.t6z4tntmj7wmjrfh%40awork3.anarazel.de
---
 src/backend/access/nbtree/nbtpage.c | 93 +++++++++++++++++++++--------
 src/include/access/nbtree.h         | 10 ++++
 src/tools/pgindent/typedefs.list    |  1 +
 3 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index d78971bfe8..bf270874d2 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -311,6 +311,29 @@ _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
 	_bt_relbuf(rel, metabuf);
 }
 
+/*
+ * Get our private per-relation cache area.
+ */
+static inline BTAMCacheData *
+_bt_getcache(Relation rel)
+{
+	BTAMCacheData *amcache;
+
+	if (unlikely(rel->rd_amcache == NULL))
+	{
+		/* Set up cache on first time through. */
+		amcache = (BTAMCacheData *)
+			MemoryContextAlloc(rel->rd_indexcxt, sizeof(*amcache));
+		amcache->meta_page_is_valid = false;
+		amcache->recent_root_buffer = InvalidBuffer;
+		rel->rd_amcache = amcache;
+	}
+	else
+		amcache = (BTAMCacheData *) rel->rd_amcache;
+
+	return amcache;
+}
+
 /*
  *	_bt_getroot() -- Get the root page of the btree.
  *
@@ -350,17 +373,21 @@ _bt_getroot(Relation rel, Relation heaprel, int access)
 	BlockNumber rootblkno;
 	uint32		rootlevel;
 	BTMetaPageData *metad;
+	BTAMCacheData *amcache;
 
 	Assert(access == BT_READ || heaprel != NULL);
 
+	amcache = _bt_getcache(rel);
+
 	/*
 	 * Try to use previously-cached metapage data to find the root.  This
 	 * normally saves one buffer access per index search, which is a very
 	 * helpful savings in bufmgr traffic and hence contention.
 	 */
-	if (rel->rd_amcache != NULL)
+	if (amcache->meta_page_is_valid)
 	{
-		metad = (BTMetaPageData *) rel->rd_amcache;
+		metad = &amcache->meta_page;
+
 		/* We shouldn't have cached it if any of these fail */
 		Assert(metad->btm_magic == BTREE_MAGIC);
 		Assert(metad->btm_version >= BTREE_MIN_VERSION);
@@ -373,7 +400,25 @@ _bt_getroot(Relation rel, Relation heaprel, int access)
 		Assert(rootblkno != P_NONE);
 		rootlevel = metad->btm_fastlevel;
 
-		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+		/* Try to find the root page in the buffer it was last seen in. */
+		if (BufferIsValid(amcache->recent_root_buffer) &&
+			ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, rootblkno,
+							 amcache->recent_root_buffer))
+		{
+			/*
+			 * It's in the same buffer as last time, and we avoided a trip
+			 * through the buffer map.
+			 */
+			rootbuf = amcache->recent_root_buffer;
+			_bt_lockbuf(rel, rootbuf, BT_READ);
+			_bt_checkpage(rel, rootbuf);
+		}
+		else
+		{
+			/* Slow path.  Remember where it is for next time. */
+			rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+			amcache->recent_root_buffer = rootbuf;
+		}
 		rootpage = BufferGetPage(rootbuf);
 		rootopaque = BTPageGetOpaque(rootpage);
 
@@ -393,10 +438,8 @@ _bt_getroot(Relation rel, Relation heaprel, int access)
 			return rootbuf;
 		}
 		_bt_relbuf(rel, rootbuf);
-		/* Cache is stale, throw it away */
-		if (rel->rd_amcache)
-			pfree(rel->rd_amcache);
-		rel->rd_amcache = NULL;
+		/* Cache is stale, mark it invalid. */
+		amcache->meta_page_is_valid = false;
 	}
 
 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
@@ -523,9 +566,8 @@ _bt_getroot(Relation rel, Relation heaprel, int access)
 		/*
 		 * Cache the metapage data for next time
 		 */
-		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
-											 sizeof(BTMetaPageData));
-		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		amcache->meta_page = *metad;
+		amcache->meta_page_is_valid = true;
 
 		/*
 		 * We are done with the metapage; arrange to release it via first
@@ -588,16 +630,16 @@ _bt_gettrueroot(Relation rel)
 	BlockNumber rootblkno;
 	uint32		rootlevel;
 	BTMetaPageData *metad;
+	BTAMCacheData *amcache;
 
 	/*
 	 * We don't try to use cached metapage data here, since (a) this path is
 	 * not performance-critical, and (b) if we are here it suggests our cache
 	 * is out-of-date anyway.  In light of point (b), it's probably safest to
-	 * actively flush any cached metapage info.
+	 * actively invalidate any cached metapage info.
 	 */
-	if (rel->rd_amcache)
-		pfree(rel->rd_amcache);
-	rel->rd_amcache = NULL;
+	amcache = _bt_getcache(rel);
+	amcache->meta_page_is_valid = false;
 
 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 	metapg = BufferGetPage(metabuf);
@@ -674,9 +716,12 @@ _bt_gettrueroot(Relation rel)
 int
 _bt_getrootheight(Relation rel)
 {
+	BTAMCacheData *amcache;
 	BTMetaPageData *metad;
 
-	if (rel->rd_amcache == NULL)
+	amcache = _bt_getcache(rel);
+
+	if (!amcache->meta_page_is_valid)
 	{
 		Buffer		metabuf;
 
@@ -697,14 +742,13 @@ _bt_getrootheight(Relation rel)
 		/*
 		 * Cache the metapage data for next time
 		 */
-		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
-											 sizeof(BTMetaPageData));
-		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		amcache->meta_page = *metad;
+		amcache->meta_page_is_valid = true;
 		_bt_relbuf(rel, metabuf);
 	}
 
 	/* Get cached page */
-	metad = (BTMetaPageData *) rel->rd_amcache;
+	metad = &amcache->meta_page;
 	/* We shouldn't have cached it if any of these fail */
 	Assert(metad->btm_magic == BTREE_MAGIC);
 	Assert(metad->btm_version >= BTREE_MIN_VERSION);
@@ -738,9 +782,11 @@ _bt_getrootheight(Relation rel)
 void
 _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
 {
+	BTAMCacheData *amcache;
 	BTMetaPageData *metad;
 
-	if (rel->rd_amcache == NULL)
+	amcache = _bt_getcache(rel);
+	if (!amcache->meta_page_is_valid)
 	{
 		Buffer		metabuf;
 
@@ -770,14 +816,13 @@ _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
 		 * from version 2 to version 3, both of which are !heapkeyspace
 		 * versions.
 		 */
-		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
-											 sizeof(BTMetaPageData));
-		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		amcache->meta_page = *metad;
+		amcache->meta_page_is_valid = true;
 		_bt_relbuf(rel, metabuf);
 	}
 
 	/* Get cached page */
-	metad = (BTMetaPageData *) rel->rd_amcache;
+	metad = &amcache->meta_page;
 	/* We shouldn't have cached it if any of these fail */
 	Assert(metad->btm_magic == BTREE_MAGIC);
 	Assert(metad->btm_version >= BTREE_MIN_VERSION);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8891fa7973..85cab606a3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -151,6 +151,16 @@ typedef struct BTMetaPageData
 #define BTREE_MIN_VERSION	2	/* minimum supported version */
 #define BTREE_NOVAC_VERSION	3	/* version with all meta fields set */
 
+/*
+ * Cache space, stored in rel->rd_amcache.
+ */
+typedef struct BTAMCacheData
+{
+	BTMetaPageData meta_page;
+	bool		meta_page_is_valid;
+	Buffer		recent_root_buffer;
+} BTAMCacheData;
+
 /*
  * Maximum size of a btree index entry, including its tuple header.
  *
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 260854747b..b75d9a5cb2 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -187,6 +187,7 @@ BOOL
 BOOLEAN
 BOX
 BTArrayKeyInfo
+BTAMCacheData
 BTBuildState
 BTCycleId
 BTDedupInterval
-- 
2.40.1
