New version.  Same code as v2, but comments improved to explain the
reasoning, with reference to README's buffer access rules.  I'm
planning to push this soon if there are no objections.
From 1fa26f407622cd69d82f3b4ea68fddf2c357cf06 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 7 Jun 2024 17:49:19 +1200
Subject: [PATCH v3] Fix RBM_ZERO_AND_LOCK.

Commit 210622c6 accidentally zeroed out pages even if they were found in
the buffer pool.  It should always lock the page, but it should only
zero pages that were not already found as an optimization to avoid I/O.
Otherwise, concurrent readers that hold only a pin might see corrupted
page contents changing under their feet.  This is done with the standard
BM_IO_IN_PROGRESS infrastructure to test for BM_VALID and wake
concurrent waiters without races (as it was in earlier releases).

Reported-by: Noah Misch <n...@leadboat.com>
Reported-by: Alexander Lakhin <exclus...@gmail.com>
Reviewed-by: Alvaro Herrera <alvhe...@alvh.no-ip.org> (earlier version)
Reviewed-by: Robert Haas <robertmh...@gmail.com> (earlier version)
Discussion: https://postgr.es/m/20240512171658.7e.nmi...@google.com
Discussion: https://postgr.es/m/7ed10231-ce47-03d5-d3f9-4aea0dc7d5a4%40gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 64 ++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 49637284f91..a8e3377263f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1010,43 +1010,69 @@ ExtendBufferedRelTo(BufferManagerRelation bmr,
 }
 
 /*
- * Zero a buffer and lock it, as part of the implementation of
+ * Lock and optionally zero a buffer, as part of the implementation of
  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
- * pinned.  It does not have to be valid, but it is valid and locked on
- * return.
+ * pinned.  If the buffer is not already valid, it is zeroed and made valid.
  */
 static void
-ZeroBuffer(Buffer buffer, ReadBufferMode mode)
+ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 {
 	BufferDesc *bufHdr;
-	uint32		buf_state;
+	bool		need_to_zero;
 
 	Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
 
-	if (BufferIsLocal(buffer))
+	if (already_valid)
+	{
+		/*
+		 * If the caller already knew the buffer was valid, we can skip some
+		 * header interaction.  The caller just wants to lock the buffer.
+		 */
+		need_to_zero = false;
+	}
+	else if (BufferIsLocal(buffer))
+	{
+		/* Simple case for non-shared buffers. */
 		bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+		need_to_zero = (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
+	}
 	else
 	{
+		/*
+		 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
+		 * concurrently.  Even though we aren't doing I/O, that protocol
+		 * ensures that we don't zero a page that someone else has pinned.  An
+		 * exclusive content lock wouldn't be enough, because readers are
+		 * allowed to drop the content lock after determining that a tuple is
+		 * visible (see buffer access rules in README).
+		 */
 		bufHdr = GetBufferDescriptor(buffer - 1);
+		need_to_zero = StartBufferIO(bufHdr, true, false);
+	}
+
+	if (need_to_zero)
+		memset(BufferGetPage(buffer), 0, BLCKSZ);
+
+	/* Acquire the requested lock before setting BM_VALID. */
+	if (!BufferIsLocal(buffer))
+	{
 		if (mode == RBM_ZERO_AND_LOCK)
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		else
 			LockBufferForCleanup(buffer);
 	}
 
-	memset(BufferGetPage(buffer), 0, BLCKSZ);
-
-	if (BufferIsLocal(buffer))
-	{
-		buf_state = pg_atomic_read_u32(&bufHdr->state);
-		buf_state |= BM_VALID;
-		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-	}
-	else
+	/*
+	 * Set BM_VALID, and wake anyone waiting for BM_IO_IN_PROGRESS to be
+	 * cleared.
+	 */
+	if (need_to_zero)
 	{
-		buf_state = LockBufHdr(bufHdr);
-		buf_state |= BM_VALID;
-		UnlockBufHdr(bufHdr, buf_state);
+		if (BufferIsLocal(buffer))
+			pg_atomic_write_u32(&bufHdr->state,
+								pg_atomic_read_u32(&bufHdr->state) | BM_VALID);
+		else
+			TerminateBufferIO(bufHdr, false, BM_VALID, true);
 	}
 }
 
@@ -1185,7 +1211,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
 
 		buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
 								   forkNum, blockNum, strategy, &found);
-		ZeroBuffer(buffer, mode);
+		ZeroAndLockBuffer(buffer, mode, found);
 		return buffer;
 	}
 
-- 
2.45.1

Reply via email to