Hello hackers,

Back in 2016, Robert Haas proposed to replace I/O locks with condition
variables[1].  Condition variables went in and have found lots of
uses, but this patch to replace a bunch of LWLocks and some busy
looping did not.  Since then, it has been tested quite a lot as part
of the AIO project[2], which currently depends on it.  That's why I'm
interested in following up now.  I asked Robert if he planned to
re-propose it and he said I should go for it, so... here I go.

At the time, Tom Lane said:

> Hmm.  I fear the only reason you see an advantage there is that you don't
> (yet) have any general-purpose mechanism for an aborting transaction to
> satisfy its responsibilities vis-a-vis waiters on condition variables.
> Instead, this wins specifically because you stuck some bespoke logic into
> AbortBufferIO.  OK ... but that sounds like we're going to end up with
> every single condition variable that ever exists in the system needing to
> be catered for separately and explicitly during transaction abort cleanup.
> Which does not sound promising from a reliability standpoint.  On the
> other hand, I don't know what the equivalent rule to "release all LWLocks
> during abort" might look like for condition variables, so I don't know
> if it's even possible to avoid that.

It's true that cases like this one need bespoke logic, but that was
already the case: you have to make sure you call TerminateBufferIO()
as before, it's just that BM_IO_IN_PROGRESS-clearing is now a
CV-broadcastable event.  That seems reasonable to me.  As for the more
general point about the danger of waiting on CVs when potential
broadcasters might abort, and with the considerable benefit of a few
years of hindsight:  I think the existing users of CVs mostly fall
into the category of waiters that will be shut down by a higher
authority if the expected broadcaster aborts.  Examples: Parallel
query's interrupt-based error system will abort every back end waiting
at a parallel hash join barrier if any process involved in the query
aborts, and the whole cluster will be shut down if you're waiting for
a checkpoint when the checkpointer dies.

It looks like there may be a nearby opportunity to improve another
(rare?) busy loop, when InvalidateBuffer() encounters a pinned buffer,
based on this comment:

     * ...  Note that if the other guy has pinned the buffer but not
     * yet done StartBufferIO, WaitIO will fall through and we'll effectively
     * be busy-looping here.)

[1] 
https://www.postgresql.org/message-id/flat/CA%2BTgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr%3DC56Xng%40mail.gmail.com
[2] 
https://www.postgresql.org/message-id/flat/20210223100344.llw5an2aklengrmn%40alap3.anarazel.de
From 64af4b141b8b96cd18b4c9afa700880cd4d01d87 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 26 Feb 2021 18:01:28 +1300
Subject: [PATCH] Replace buffer I/O locks with condition variables.

1.  Backends waiting for buffer I/O are now interruptible.

2.  If something goes wrong in a backend that is currently performing
I/O, other backends no longer wake up until that backend reaches
AbortBufferIO() and broadcasts on the CV.  Previously, any waiters would
wake up (because the I/O lock was automatically released) and then
busy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS.

Author: Robert Haas <robertmh...@gmail.com>
Discussion: https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=c56...@mail.gmail.com
---
 src/backend/postmaster/pgstat.c          |  3 ++
 src/backend/storage/buffer/buf_init.c    | 19 ++++----
 src/backend/storage/buffer/bufmgr.c      | 62 +++++++-----------------
 src/backend/storage/lmgr/lwlock.c        |  2 -
 src/include/pgstat.h                     |  1 +
 src/include/storage/buf_internals.h      |  7 +--
 src/include/storage/condition_variable.h | 11 +++++
 src/include/storage/lwlock.h             |  1 -
 8 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index f75b52719d..d399b9c3e5 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4190,6 +4190,9 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_BUFFILE_TRUNCATE:
 			event_name = "BufFileTruncate";
 			break;
+		case WAIT_EVENT_BUFFILE_WAITIO:
+			event_name = "BufFileWaitIO";
+			break;
 		case WAIT_EVENT_CONTROL_FILE_READ:
 			event_name = "ControlFileRead";
 			break;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e9e4f35bb5..850ef8af4f 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -19,7 +19,7 @@
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
-LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+ConditionVariableMinimallyPadded *BufferIOCVArray = NULL;
 WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
 
@@ -68,7 +68,7 @@ InitBufferPool(void)
 {
 	bool		foundBufs,
 				foundDescs,
-				foundIOLocks,
+				foundIOCV,
 				foundBufCkpt;
 
 	/* Align descriptors to a cacheline boundary. */
@@ -82,10 +82,10 @@ InitBufferPool(void)
 						NBuffers * (Size) BLCKSZ, &foundBufs);
 
 	/* Align lwlocks to cacheline boundary */
-	BufferIOLWLockArray = (LWLockMinimallyPadded *)
-		ShmemInitStruct("Buffer IO Locks",
-						NBuffers * (Size) sizeof(LWLockMinimallyPadded),
-						&foundIOLocks);
+	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+		ShmemInitStruct("Buffer IO Condition Variables",
+			  NBuffers * (Size) sizeof(ConditionVariableMinimallyPadded),
+									   &foundIOCV);
 
 	/*
 	 * The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
 		ShmemInitStruct("Checkpoint BufferIds",
 						NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
 
-	if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+	if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
 	{
 		/* should find all of these, or none of them */
-		Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+		Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
@@ -131,8 +131,7 @@ InitBufferPool(void)
 			LWLockInitialize(BufferDescriptorGetContentLock(buf),
 							 LWTRANCHE_BUFFER_CONTENT);
 
-			LWLockInitialize(BufferDescriptorGetIOLock(buf),
-							 LWTRANCHE_BUFFER_IO);
+			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
 		}
 
 		/* Correct last entry of linked list */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 561c212092..a3ec89707b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1352,8 +1352,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	LWLockRelease(newPartitionLock);
 
 	/*
-	 * Buffer contents are currently invalid.  Try to get the io_in_progress
-	 * lock.  If StartBufferIO returns false, then someone else managed to
+	 * Buffer contents are currently invalid.  Try to obtain the right to start
+	 * I/O.  If StartBufferIO returns false, then someone else managed to
 	 * read it before we did, so there's nothing left for BufferAlloc() to do.
 	 */
 	if (StartBufferIO(buf, true))
@@ -1777,9 +1777,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 		 */
 		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
 
-		/* I'd better not still hold any locks on the buffer */
+		/* I'd better not still hold the buffer content lock */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
-		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
 		/*
 		 * Decrement the shared reference count.
@@ -2742,9 +2741,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	uint32		buf_state;
 
 	/*
-	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
-	 * false, then someone else flushed the buffer before we could, so we need
-	 * not do anything.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
 	if (!StartBufferIO(buf, false))
 		return;
@@ -2800,7 +2799,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	/*
 	 * Now it's safe to write buffer to disk. Note that no one else should
 	 * have been able to write it while we were busy with log flushing because
-	 * we have the io_in_progress lock.
+	 * only one process at a time can set the BM_IO_IN_PROGRESS bit.
 	 */
 	bufBlock = BufHdrGetBlock(buf);
 
@@ -2835,7 +2834,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
-	 * end the io_in_progress state.
+	 * end the BM_IO_IN_PROGRESS state.
 	 */
 	TerminateBufferIO(buf, true, 0);
 
@@ -4271,7 +4270,7 @@ IsBufferCleanupOK(Buffer buffer)
  *	Functions for buffer I/O handling
  *
  *	Note: We assume that nested buffer I/O never occurs.
- *	i.e at most one io_in_progress lock is held per proc.
+ *	i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
  *
  *	Also note that these are used only for shared buffers, not local ones.
  */
@@ -4282,13 +4281,10 @@ IsBufferCleanupOK(Buffer buffer)
 static void
 WaitIO(BufferDesc *buf)
 {
-	/*
-	 * Changed to wait until there's no IO - Inoue 01/13/2000
-	 *
-	 * Note this is *necessary* because an error abort in the process doing
-	 * I/O could release the io_in_progress_lock prematurely. See
-	 * AbortBufferIO.
-	 */
+	ConditionVariable	*cv = BufferDescriptorGetIOCV(buf);
+
+	ConditionVariablePrepareToSleep(cv);
+
 	for (;;)
 	{
 		uint32		buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
 
 		if (!(buf_state & BM_IO_IN_PROGRESS))
 			break;
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
+		ConditionVariableSleep(cv, WAIT_EVENT_BUFFILE_WAITIO);
 	}
+	ConditionVariableCancelSleep();
 }
 
 /*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
  * In some scenarios there are race conditions in which multiple backends
  * could attempt the same I/O operation concurrently.  If someone else
  * has already started I/O on this buffer then we will block on the
- * io_in_progress lock until he's done.
+ * I/O condition variable until he's done.
  *
  * Input operations are only attempted on buffers that are not BM_VALID,
  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 
 	for (;;)
 	{
-		/*
-		 * Grab the io_in_progress lock so that other processes can wait for
-		 * me to finish the I/O.
-		 */
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
 		buf_state = LockBufHdr(buf);
 
 		if (!(buf_state & BM_IO_IN_PROGRESS))
 			break;
-
-		/*
-		 * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
-		 * lock isn't held is if the process doing the I/O is recovering from
-		 * an error (see AbortBufferIO).  If that's the case, we must wait for
-		 * him to get unwedged.
-		 */
 		UnlockBufHdr(buf, buf_state);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		WaitIO(buf);
 	}
 
@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf, buf_state);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		return false;
 	}
 
@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
  *	(Assumptions)
  *	My process is executing IO for the buffer
  *	BM_IO_IN_PROGRESS bit is set for the buffer
- *	We hold the buffer's io_in_progress lock
  *	The buffer is Pinned
  *
  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
 	InProgressBuf = NULL;
 
-	LWLockRelease(BufferDescriptorGetIOLock(buf));
+	ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
 }
 
 /*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
 	{
 		uint32		buf_state;
 
-		/*
-		 * Since LWLockReleaseAll has already been called, we're not holding
-		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
-		 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
-		 * buffer will be in a busy spin until we succeed in doing this.
-		 */
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
 		buf_state = LockBufHdr(buf);
 		Assert(buf_state & BM_IO_IN_PROGRESS);
 		if (IsForInput)
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 8cb6a6f042..60600d7a0b 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
 	"WALInsert",
 	/* LWTRANCHE_BUFFER_CONTENT: */
 	"BufferContent",
-	/* LWTRANCHE_BUFFER_IO: */
-	"BufferIO",
 	/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
 	"ReplicationOriginState",
 	/* LWTRANCHE_REPLICATION_SLOT_IO: */
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 724068cf87..6adf542e22 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1028,6 +1028,7 @@ typedef enum
 	WAIT_EVENT_BUFFILE_READ,
 	WAIT_EVENT_BUFFILE_WRITE,
 	WAIT_EVENT_BUFFILE_TRUNCATE,
+	WAIT_EVENT_BUFFILE_WAITIO,
 	WAIT_EVENT_CONTROL_FILE_READ,
 	WAIT_EVENT_CONTROL_FILE_SYNC,
 	WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index f6b5782965..532711200a 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -18,6 +18,7 @@
 #include "port/atomics.h"
 #include "storage/buf.h"
 #include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
 #include "storage/latch.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
@@ -221,12 +222,12 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
-#define BufferDescriptorGetIOLock(bdesc) \
-	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetIOCV(bdesc) \
+	(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
 #define BufferDescriptorGetContentLock(bdesc) \
 	((LWLock*) (&(bdesc)->content_lock))
 
-extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
 
 /*
  * The freeNext field is either the index of the next freelist entry,
diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h
index 0b7578f8c4..4cba0eccff 100644
--- a/src/include/storage/condition_variable.h
+++ b/src/include/storage/condition_variable.h
@@ -31,6 +31,17 @@ typedef struct
 	proclist_head wakeup;		/* list of wake-able processes */
 } ConditionVariable;
 
+/*
+ * Pad a condition variable to a power-of-two size so that an array of
+ * condition variables does not cross a cache line boundary.
+ */
+#define CV_MINIMAL_SIZE		(sizeof(ConditionVariable) <= 16 ? 16 : 32)
+typedef union ConditionVariableMinimallyPadded
+{
+	ConditionVariable	cv;
+	char		pad[CV_MINIMAL_SIZE];
+} ConditionVariableMinimallyPadded;
+
 /* Initialize a condition variable. */
 extern void ConditionVariableInit(ConditionVariable *cv);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cbf2510fbf..0ed190f73b 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -202,7 +202,6 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_SERIAL_BUFFER,
 	LWTRANCHE_WAL_INSERT,
 	LWTRANCHE_BUFFER_CONTENT,
-	LWTRANCHE_BUFFER_IO,
 	LWTRANCHE_REPLICATION_ORIGIN_STATE,
 	LWTRANCHE_REPLICATION_SLOT_IO,
 	LWTRANCHE_LOCK_FASTPATH,
-- 
2.30.0

Reply via email to