From be039fdf42ba3bf6637697f6fec38874f94d09b6 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Fri, 11 Jul 2025 09:05:45 -0400
Subject: [PATCH v2 2/2] Remove the buffer_strategy_lock.

With the freelist removed, the remaining fields in the BufferStrategyControl
structure no longer require strict coordination under a spinlock; atomic
operations suffice.  Convert completePasses to pg_atomic_uint32, update the
buffer README accordingly, and drop buffer_strategy_lock entirely.
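
For reference, the reader side now reduces to roughly the following
(simplified from StrategySyncStart(); illustrative only, reusing the names
from freelist.c rather than compiling on its own):

    uint32 next   = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
    uint32 passes = pg_atomic_read_u32(&StrategyControl->completePasses);
    int    buf_id = next % NBuffers;

    /*
     * Add the wraparounds that happened before completePasses could be
     * incremented, cf. ClockSweepTick().
     */
    passes += next / NBuffers;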
---
 src/backend/storage/buffer/README     | 39 +++++++++----------
 src/backend/storage/buffer/bufmgr.c   |  8 ++--
 src/backend/storage/buffer/freelist.c | 56 ++++++---------------------
 src/backend/storage/buffer/localbuf.c |  2 +-
 src/include/storage/buf_internals.h   |  2 +-
 5 files changed, 36 insertions(+), 71 deletions(-)

diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index cd52effd911..a60f77d7ee9 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -127,11 +127,10 @@ bits of the tag's hash value.  The rules stated above apply to each partition
 independently.  If it is necessary to lock more than one partition at a time,
 they must be locked in partition-number order to avoid risk of deadlock.
 
-* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
-exclusion for operations that select buffers for replacement.  A spinlock is
-used here rather than a lightweight lock for efficiency; no other locks of any
-sort should be acquired while buffer_strategy_lock is held.  This is essential
-to allow buffer replacement to happen in multiple backends with reasonable
+* Operations that select buffers for replacement don't require a lock; they
+use atomic operations to coordinate across backends when accessing members
+of the BufferStrategyControl data structure.  This allows buffer
+replacement to happen in multiple backends with reasonable
+concurrency.
 
 * Each buffer header contains a spinlock that must be taken when examining
@@ -173,14 +172,12 @@ buffer_strategy_lock.
 
 The algorithm for a process that needs to obtain a victim buffer is:
 
-1. Obtain buffer_strategy_lock.
+1. Select the buffer pointed to by nextVictimBuffer, and circularly advance
+nextVictimBuffer for next time.
 
-2. Select the buffer pointed to by nextVictimBuffer, and circularly advance
-nextVictimBuffer for next time. Release buffer_strategy_lock.
-
-3. If the selected buffer is pinned or has a nonzero usage count, it cannot
-be used.  Decrement its usage count (if nonzero), reacquire
-buffer_strategy_lock, and return to step 3 to examine the next buffer.
+2. If the selected buffer is pinned or has a nonzero usage count, it cannot be
+used.  Decrement its usage count (if nonzero) and return to step 1 to examine
+the next buffer.
 
-4. Pin the selected buffer, and return.
+3. Pin the selected buffer, and return.
 
@@ -196,9 +193,9 @@ Buffer Ring Replacement Strategy
 When running a query that needs to access a large number of pages just once,
 such as VACUUM or a large sequential scan, a different strategy is used.
 A page that has been touched only by such a scan is unlikely to be needed
-again soon, so instead of running the normal clock sweep algorithm and
+again soon, so instead of running the normal clock-sweep algorithm and
 blowing out the entire buffer cache, a small ring of buffers is allocated
-using the normal clock sweep algorithm and those buffers are reused for the
+using the normal clock-sweep algorithm and those buffers are reused for the
 whole scan.  This also implies that much of the write traffic caused by such
 a statement will be done by the backend itself and not pushed off onto other
 processes.
@@ -244,13 +241,13 @@ nextVictimBuffer (which it does not change!), looking for buffers that are
 dirty and not pinned nor marked with a positive usage count.  It pins,
 writes, and releases any such buffer.
 
-If we can assume that reading nextVictimBuffer is an atomic action, then
-the writer doesn't even need to take buffer_strategy_lock in order to look
-for buffers to write; it needs only to spinlock each buffer header for long
-enough to check the dirtybit.  Even without that assumption, the writer
-only needs to take the lock long enough to read the variable value, not
-while scanning the buffers.  (This is a very substantial improvement in
-the contention cost of the writer compared to PG 8.0.)
+Reading nextVictimBuffer is an atomic operation, so the writer doesn't need
+to take any lock in order to look for buffers to write; it needs only to
+spinlock each buffer header for long enough to check the dirtybit.
+Likewise, StrategySyncStart() reads nextVictimBuffer and completePasses
+using atomic operations rather than under a lock, so the writer holds no
+lock at all while scanning the buffers.  (This is a very substantial
+improvement in the contention cost of the writer compared to PG 8.0.)
 
 The background writer takes shared content lock on a buffer while writing it
 out (and anyone else who flushes buffer contents to disk must do so too).
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9c059441a5c..d068f77362d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3593,7 +3593,7 @@ BufferSync(int flags)
  * This is called periodically by the background writer process.
  *
  * Returns true if it's appropriate for the bgwriter process to go into
- * low-power hibernation mode.  (This happens if the strategy clock sweep
+ * low-power hibernation mode.  (This happens if the strategy clock-sweep
  * has been "lapped" and no buffer allocations have occurred recently,
  * or if the bgwriter has been effectively disabled by setting
  * bgwriter_lru_maxpages to 0.)
@@ -3643,7 +3643,7 @@ BgBufferSync(WritebackContext *wb_context)
 	uint32		new_recent_alloc;
 
 	/*
-	 * Find out where the clock sweep currently is, and how many buffer
+	 * Find out where the clock-sweep currently is, and how many buffer
 	 * allocations have happened since our last call.
 	 */
 	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
@@ -3664,8 +3664,8 @@ BgBufferSync(WritebackContext *wb_context)
 
 	/*
 	 * Compute strategy_delta = how many buffers have been scanned by the
-	 * clock sweep since last time.  If first time through, assume none. Then
-	 * see if we are still ahead of the clock sweep, and if so, how many
+	 * clock-sweep since last time.  If first time through, assume none. Then
+	 * see if we are still ahead of the clock-sweep, and if so, how many
 	 * buffers we could scan before we'd catch up with it and "lap" it. Note:
 	 * weird-looking coding of xxx_passes comparisons are to avoid bogus
 	 * behavior when the passes counts wrap around.
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index a228ff27377..267a9d84df3 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -29,11 +29,8 @@
  */
 typedef struct
 {
-	/* Spinlock: protects the values below */
-	slock_t		buffer_strategy_lock;
-
 	/*
-	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
+	 * Clock-sweep hand: index of next buffer to consider grabbing. Note that
 	 * this isn't a concrete buffer - we only ever increase the value. So, to
 	 * get an actual buffer, it needs to be used modulo NBuffers.
 	 */
@@ -43,7 +40,7 @@ typedef struct
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
 	 */
-	uint32		completePasses; /* Complete cycles of the clock sweep */
+	pg_atomic_uint32 completePasses;	/* Complete cycles of the clock-sweep */
 	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */
 
 	/*
@@ -116,12 +113,7 @@ ClockSweepTick(void)
 		/* always wrap what we look up in BufferDescriptors */
 		victim = victim % NBuffers;
 
-		/*
-		 * If we're the one that just caused a wraparound, force
-		 * completePasses to be incremented while holding the spinlock. We
-		 * need the spinlock so StrategySyncStart() can return a consistent
-		 * value consisting of nextVictimBuffer and completePasses.
-		 */
+		/* Increment completePasses if we just caused a wraparound */
 		if (victim == 0)
 		{
 			uint32		expected;
@@ -132,23 +124,12 @@ ClockSweepTick(void)
 
 			while (!success)
 			{
-				/*
-				 * Acquire the spinlock while increasing completePasses. That
-				 * allows other readers to read nextVictimBuffer and
-				 * completePasses in a consistent manner which is required for
-				 * StrategySyncStart().  In theory delaying the increment
-				 * could lead to an overflow of nextVictimBuffers, but that's
-				 * highly unlikely and wouldn't be particularly harmful.
-				 */
-				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
 				wrapped = expected % NBuffers;
 
 				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
 														 &expected, wrapped);
 				if (success)
-					StrategyControl->completePasses++;
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+					pg_atomic_fetch_add_u32(&StrategyControl->completePasses, 1);
 			}
 		}
 	}
@@ -177,10 +158,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 
 	*from_ring = false;
 
-	/*
-	 * If given a strategy object, see whether it can select a buffer. We
-	 * assume strategy objects don't need buffer_strategy_lock.
-	 */
+	/* If given a strategy object, see whether it can select a buffer */
 	if (strategy != NULL)
 	{
 		buf = GetBufferFromRing(strategy, buf_state);
@@ -225,7 +203,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	 */
 	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
-	/* Use the "clock sweep" algorithm to find a free buffer */
+	/* Use the "clock-sweep" algorithm to find a free buffer */
 	trycounter = NBuffers;
 	for (;;)
 	{
@@ -287,13 +265,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	uint32		nextVictimBuffer;
 	int			result;
 
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
 	result = nextVictimBuffer % NBuffers;
 
 	if (complete_passes)
 	{
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses);
 
 		/*
 		 * Additionally add the number of wraparounds that happened before
@@ -306,7 +283,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	{
 		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
 	}
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
 	return result;
 }
 
@@ -321,21 +298,14 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 void
 StrategyNotifyBgWriter(int bgwprocno)
 {
-	/*
-	 * We acquire buffer_strategy_lock just to ensure that the store appears
-	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
-	 * infrequently, so there's no performance penalty from being safe.
-	 */
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 	StrategyControl->bgwprocno = bgwprocno;
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 }
 
 
 /*
  * StrategyShmemSize
  *
- * estimate the size of shared memory used by the freelist-related structures.
+ * Estimate the size of shared memory used by the freelist-related structures.
  *
  * Note: for somewhat historical reasons, the buffer lookup hashtable size
  * is also determined here.
@@ -393,13 +363,11 @@ StrategyInitialize(bool init)
 		 */
 		Assert(init);
 
-		SpinLockInit(&StrategyControl->buffer_strategy_lock);
-
-		/* Initialize the clock sweep pointer */
+		/* Initialize the clock-sweep pointer */
 		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
 		/* Clear statistics */
-		StrategyControl->completePasses = 0;
+		pg_atomic_init_u32(&StrategyControl->completePasses, 0);
 		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
 		/* No pending notification */
@@ -643,7 +611,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	 *
 	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
 	 * since our own previous usage of the ring element would have left it
-	 * there, but it might've been decremented by clock sweep since then). A
+	 * there, but it might've been decremented by clock-sweep since then). A
 	 * higher usage_count indicates someone else has touched the buffer, so we
 	 * shouldn't re-use it.
 	 */
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3da9c41ee1d..7a34f5e430a 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -229,7 +229,7 @@ GetLocalVictimBuffer(void)
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 
 	/*
-	 * Need to get a new buffer.  We use a clock sweep algorithm (essentially
+	 * Need to get a new buffer.  We use a clock-sweep algorithm (essentially
 	 * the same as what freelist.c does now...)
 	 */
 	trycounter = NLocBuffer;
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 00eade63971..133a0dd7fd5 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -81,7 +81,7 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32,
  * accuracy and speed of the clock-sweep buffer management algorithm.  A
  * large value (comparable to NBuffers) would approximate LRU semantics.
  * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
- * clock sweeps to find a free buffer, so in practice we don't want the
+ * clock-sweeps to find a free buffer, so in practice we don't want the
  * value to be very large.
  */
 #define BM_MAX_USAGE_COUNT	5
-- 
2.49.0

