From 8c0733e8f0ddddd97e461057fabb254a11d32dd8 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Thu, 10 Jul 2025 14:45:32 -0400
Subject: [PATCH v13 2/4] Eliminate the freelist from the buffer manager and
 depend on clock-sweep

This set of changes removes the list of available buffers and instead
simply uses the clock-sweep algorithm to find and return an available
buffer.  While on the surface this appears to be removing an
optimization it is in fact eliminating code that induces overhead in the
form of synchronization that is problemmatic for multi-core systems.
This also removes the have_free_buffer() function and simply caps the
pg_autoprewarm process to at most NBuffers.
---
 contrib/pg_prewarm/autoprewarm.c      |  31 ++++---
 src/backend/storage/buffer/README     |  40 +++------
 src/backend/storage/buffer/buf_init.c |   9 --
 src/backend/storage/buffer/bufmgr.c   |  29 +------
 src/backend/storage/buffer/freelist.c | 119 +-------------------------
 src/include/storage/buf_internals.h   |  12 +--
 6 files changed, 32 insertions(+), 208 deletions(-)

diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index c01b9c7e6a4..2722b0bb443 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -370,6 +370,16 @@ apw_load_buffers(void)
 	apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
 	apw_state->prewarmed_blocks = 0;
 
+
+	/* Don't prewarm more than we can fit. */
+	if (num_elements > NBuffers)
+	{
+		num_elements = NBuffers;
+		ereport(LOG,
+				(errmsg("autoprewarm: capping prewarmed blocks to %d (shared_buffers size)",
+						NBuffers)));
+	}
+
 	/* Get the info position of the first block of the next database. */
 	while (apw_state->prewarm_start_idx < num_elements)
 	{
@@ -410,10 +420,6 @@ apw_load_buffers(void)
 		apw_state->database = current_db;
 		Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
 
-		/* If we've run out of free buffers, don't launch another worker. */
-		if (!have_free_buffer())
-			break;
-
 		/*
 		 * Likewise, don't launch if we've already been told to shut down.
 		 * (The launch would fail anyway, but we might as well skip it.)
@@ -462,12 +468,6 @@ apw_read_stream_next_block(ReadStream *stream,
 	{
 		BlockInfoRecord blk = p->block_info[p->pos];
 
-		if (!have_free_buffer())
-		{
-			p->pos = apw_state->prewarm_stop_idx;
-			return InvalidBlockNumber;
-		}
-
 		if (blk.tablespace != p->tablespace)
 			return InvalidBlockNumber;
 
@@ -523,10 +523,10 @@ autoprewarm_database_main(Datum main_arg)
 	blk = block_info[i];
 
 	/*
-	 * Loop until we run out of blocks to prewarm or until we run out of free
+	 * Loop until we run out of blocks to prewarm or until we run out of
 	 * buffers.
 	 */
-	while (i < apw_state->prewarm_stop_idx && have_free_buffer())
+	while (i < apw_state->prewarm_stop_idx)
 	{
 		Oid			tablespace = blk.tablespace;
 		RelFileNumber filenumber = blk.filenumber;
@@ -568,14 +568,13 @@ autoprewarm_database_main(Datum main_arg)
 
 		/*
 		 * We have a relation; now let's loop until we find a valid fork of
-		 * the relation or we run out of free buffers. Once we've read from
-		 * all valid forks or run out of options, we'll close the relation and
+		 * the relation or we run out of buffers. Once we've read from all
+		 * valid forks or run out of options, we'll close the relation and
 		 * move on.
 		 */
 		while (i < apw_state->prewarm_stop_idx &&
 			   blk.tablespace == tablespace &&
-			   blk.filenumber == filenumber &&
-			   have_free_buffer())
+			   blk.filenumber == filenumber)
 		{
 			ForkNumber	forknum = blk.forknum;
 			BlockNumber nblocks;
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 4b13da5d7ad..119f31b5d65 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -128,11 +128,11 @@ independently.  If it is necessary to lock more than one partition at a time,
 they must be locked in partition-number order to avoid risk of deadlock.
 
 * A separate system-wide spinlock, buffer_strategy_lock, provides mutual
-exclusion for operations that access the buffer free list or select
-buffers for replacement.  A spinlock is used here rather than a lightweight
-lock for efficiency; no other locks of any sort should be acquired while
-buffer_strategy_lock is held.  This is essential to allow buffer replacement
-to happen in multiple backends with reasonable concurrency.
+exclusion for operations that select buffers for replacement.  A spinlock is
+used here rather than a lightweight lock for efficiency; no other locks of any
+sort should be acquired while buffer_strategy_lock is held.  This is essential
+to allow buffer replacement to happen in multiple backends with reasonable
+concurrency.
 
 * Each buffer header contains a spinlock that must be taken when examining
 or changing fields of that buffer header.  This allows operations such as
@@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable.
 Normal Buffer Replacement Strategy
 ----------------------------------
 
-There is a "free list" of buffers that are prime candidates for replacement.
-In particular, buffers that are completely free (contain no valid page) are
-always in this list.  We could also throw buffers into this list if we
-consider their pages unlikely to be needed soon; however, the current
-algorithm never does that.  The list is singly-linked using fields in the
-buffer headers; we maintain head and tail pointers in global variables.
-(Note: although the list links are in the buffer headers, they are
-considered to be protected by the buffer_strategy_lock, not the buffer-header
-spinlocks.)  To choose a victim buffer to recycle when there are no free
-buffers available, we use a simple clock-sweep algorithm, which avoids the
-need to take system-wide locks during common operations.  It works like
-this:
+To choose a victim buffer to recycle we use a simple clock-sweep algorithm.  It
+works like this:
 
 Each buffer header contains a usage counter, which is incremented (up to a
 small limit value) whenever the buffer is pinned.  (This requires only the
@@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is:
 
 1. Obtain buffer_strategy_lock.
 
-2. If buffer free list is nonempty, remove its head buffer.  Release
-buffer_strategy_lock.  If the buffer is pinned or has a nonzero usage count,
-it cannot be used; ignore it go back to step 1.  Otherwise, pin the buffer,
-and return it.
+2. Select the buffer pointed to by nextVictimBuffer, and circularly advance
+nextVictimBuffer for next time. Release buffer_strategy_lock.
 
-3. Otherwise, the buffer free list is empty.  Select the buffer pointed to by
-nextVictimBuffer, and circularly advance nextVictimBuffer for next time.
-Release buffer_strategy_lock.
-
-4. If the selected buffer is pinned or has a nonzero usage count, it cannot
+3. If the selected buffer is pinned or has a nonzero usage count, it cannot
 be used.  Decrement its usage count (if nonzero), reacquire
 buffer_strategy_lock, and return to step 3 to examine the next buffer.
 
-5. Pin the selected buffer, and return.
+4. Pin the selected buffer, and return.
 
 (Note that if the selected buffer is dirty, we will have to write it out
 before we can recycle it; if someone else pins the buffer meanwhile we will
@@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy.
 
 VACUUM uses a ring like sequential scans, however, the size of this ring is
 controlled by the vacuum_buffer_usage_limit GUC.  Dirty pages are not removed
-from the ring.  Instead, WAL is flushed if needed to allow reuse of the
+from the ring.  Instead, the WAL is flushed if needed to allow reuse of the
 buffers.  Before introducing the buffer ring strategy in 8.3, VACUUM's buffers
 were sent to the freelist, which was effectively a buffer ring of 1 buffer,
 resulting in excessive WAL flushing.
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index ed1dc488a42..6fd3a6bbac5 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -128,20 +128,11 @@ BufferManagerShmemInit(void)
 
 			pgaio_wref_clear(&buf->io_wref);
 
-			/*
-			 * Initially link all the buffers together as unused. Subsequent
-			 * management of this list is done by freelist.c.
-			 */
-			buf->freeNext = i + 1;
-
 			LWLockInitialize(BufferDescriptorGetContentLock(buf),
 							 LWTRANCHE_BUFFER_CONTENT);
 
 			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
 		}
-
-		/* Correct last entry of linked list */
-		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
 	}
 
 	/* Init other shared buffer-management stuff */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 396b053b3fa..719a5bb6f97 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2094,12 +2094,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		UnpinBuffer(victim_buf_hdr);
 
-		/*
-		 * The victim buffer we acquired previously is clean and unused, let
-		 * it be found again quickly
-		 */
-		StrategyFreeBuffer(victim_buf_hdr);
-
 		/* remaining code should match code at top of routine */
 
 		existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
@@ -2158,8 +2152,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 }
 
 /*
- * InvalidateBuffer -- mark a shared buffer invalid and return it to the
- * freelist.
+ * InvalidateBuffer -- mark a shared buffer invalid.
  *
  * The buffer header spinlock must be held at entry.  We drop it before
  * returning.  (This is sane because the caller must have locked the
@@ -2257,11 +2250,6 @@ retry:
 	 * Done with mapping lock.
 	 */
 	LWLockRelease(oldPartitionLock);
-
-	/*
-	 * Insert the buffer at the head of the list of free buffers.
-	 */
-	StrategyFreeBuffer(buf);
 }
 
 /*
@@ -2679,11 +2667,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 		{
 			BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
 
-			/*
-			 * The victim buffer we acquired previously is clean and unused,
-			 * let it be found again quickly
-			 */
-			StrategyFreeBuffer(buf_hdr);
 			UnpinBuffer(buf_hdr);
 		}
 
@@ -2756,12 +2739,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 			valid = PinBuffer(existing_hdr, strategy);
 
 			LWLockRelease(partition_lock);
-
-			/*
-			 * The victim buffer we acquired previously is clean and unused,
-			 * let it be found again quickly
-			 */
-			StrategyFreeBuffer(victim_buf_hdr);
 			UnpinBuffer(victim_buf_hdr);
 
 			buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
@@ -3658,8 +3635,8 @@ BgBufferSync(WritebackContext *wb_context)
 	uint32		new_recent_alloc;
 
 	/*
-	 * Find out where the freelist clock-sweep currently is, and how many
-	 * buffer allocations have happened since our last call.
+	 * Find out where the clock-sweep currently is, and how many buffer
+	 * allocations have happened since our last call.
 	 */
 	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
 
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index cd94a7d8a7b..7d59a92bd1a 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -39,14 +39,6 @@ typedef struct
 	 */
 	pg_atomic_uint32 nextVictimBuffer;
 
-	int			firstFreeBuffer;	/* Head of list of unused buffers */
-	int			lastFreeBuffer; /* Tail of list of unused buffers */
-
-	/*
-	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
-	 * when the list is empty)
-	 */
-
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
@@ -163,23 +155,6 @@ ClockSweepTick(void)
 	return victim;
 }
 
-/*
- * have_free_buffer -- a lockless check to see if there is a free buffer in
- *					   buffer pool.
- *
- * If the result is true that will become stale once free buffers are moved out
- * by other operations, so the caller who strictly want to use a free buffer
- * should not call this.
- */
-bool
-have_free_buffer(void)
-{
-	if (StrategyControl->firstFreeBuffer >= 0)
-		return true;
-	else
-		return false;
-}
-
 /*
  * StrategyGetBuffer
  *
@@ -249,69 +224,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	 */
 	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
-	/*
-	 * First check, without acquiring the lock, whether there's buffers in the
-	 * freelist. Since we otherwise don't require the spinlock in every
-	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
-	 * uselessly in most cases. That obviously leaves a race where a buffer is
-	 * put on the freelist but we don't see the store yet - but that's pretty
-	 * harmless, it'll just get used during the next buffer acquisition.
-	 *
-	 * If there's buffers on the freelist, acquire the spinlock to pop one
-	 * buffer of the freelist. Then check whether that buffer is usable and
-	 * repeat if not.
-	 *
-	 * Note that the freeNext fields are considered to be protected by the
-	 * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to
-	 * manipulate them without holding the spinlock.
-	 */
-	if (StrategyControl->firstFreeBuffer >= 0)
-	{
-		while (true)
-		{
-			/* Acquire the spinlock to remove element from the freelist */
-			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
-			if (StrategyControl->firstFreeBuffer < 0)
-			{
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-				break;
-			}
-
-			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
-			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
-
-			/* Unconditionally remove buffer from freelist */
-			StrategyControl->firstFreeBuffer = buf->freeNext;
-			buf->freeNext = FREENEXT_NOT_IN_LIST;
-
-			/*
-			 * Release the lock so someone else can access the freelist while
-			 * we check out this buffer.
-			 */
-			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-
-			/*
-			 * If the buffer is pinned or has a nonzero usage_count, we cannot
-			 * use it; discard it and retry.  (This can only happen if VACUUM
-			 * put a valid buffer in the freelist and then someone else used
-			 * it before we got to it.  It's probably impossible altogether as
-			 * of 8.3, but we'd better check anyway.)
-			 */
-			local_buf_state = LockBufHdr(buf);
-			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
-				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
-			{
-				if (strategy != NULL)
-					AddBufferToRing(strategy, buf);
-				*buf_state = local_buf_state;
-				return buf;
-			}
-			UnlockBufHdr(buf, local_buf_state);
-		}
-	}
-
-	/* Nothing on the freelist, so run the "clock-sweep" algorithm */
+	/* Use the "clock sweep" algorithm to find a free buffer */
 	trycounter = NBuffers;
 	for (;;)
 	{
@@ -356,29 +269,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	}
 }
 
-/*
- * StrategyFreeBuffer: put a buffer on the freelist
- */
-void
-StrategyFreeBuffer(BufferDesc *buf)
-{
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
-	/*
-	 * It is possible that we are told to put something in the freelist that
-	 * is already in it; don't screw up the list if so.
-	 */
-	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
-	{
-		buf->freeNext = StrategyControl->firstFreeBuffer;
-		if (buf->freeNext < 0)
-			StrategyControl->lastFreeBuffer = buf->buf_id;
-		StrategyControl->firstFreeBuffer = buf->buf_id;
-	}
-
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-}
-
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
@@ -504,13 +394,6 @@ StrategyInitialize(bool init)
 
 		SpinLockInit(&StrategyControl->buffer_strategy_lock);
 
-		/*
-		 * Grab the whole linked list of free buffers for our strategy. We
-		 * assume it was previously set up by BufferManagerShmemInit().
-		 */
-		StrategyControl->firstFreeBuffer = 0;
-		StrategyControl->lastFreeBuffer = NBuffers - 1;
-
 		/* Initialize the clock-sweep pointer */
 		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 3a210c710f6..9fcc94ef02d 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -217,8 +217,7 @@ BufMappingPartitionLockByIndex(uint32 index)
  * single atomic variable.  This layout allow us to do some operations in a
  * single atomic operation, without actually acquiring and releasing spinlock;
  * for instance, increase or decrease refcount.  buf_id field never changes
- * after initialization, so does not need locking.  freeNext is protected by
- * the buffer_strategy_lock not buffer header lock.  The LWLock can take care
+ * after initialization, so does not need locking.  The LWLock can take care
  * of itself.  The buffer header lock is *not* used to control access to the
  * data in the buffer!
  *
@@ -264,7 +263,6 @@ typedef struct BufferDesc
 	pg_atomic_uint32 state;
 
 	int			wait_backend_pgprocno;	/* backend of pin-count waiter */
-	int			freeNext;		/* link in freelist chain */
 
 	PgAioWaitRef io_wref;		/* set iff AIO is in progress */
 	LWLock		content_lock;	/* to lock access to buffer contents */
@@ -360,13 +358,6 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc)
 	return (LWLock *) (&bdesc->content_lock);
 }
 
-/*
- * The freeNext field is either the index of the next freelist entry,
- * or one of these special values:
- */
-#define FREENEXT_END_OF_LIST	(-1)
-#define FREENEXT_NOT_IN_LIST	(-2)
-
 /*
  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
  * not apply these to local buffers!
@@ -453,7 +444,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
-extern bool have_free_buffer(void);
 
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
-- 
2.49.0

