On Fri, Feb 26, 2021 at 7:08 PM Thomas Munro <thomas.mu...@gmail.com> wrote: > Back in 2016, Robert Haas proposed to replace I/O locks with condition > variables[1]. Condition variables went in and have found lots of > uses, but this patch to replace a bunch of LWLocks and some busy > looping did not. Since then, it has been tested quite a lot as part > of the AIO project[2], which currently depends on it. That's why I'm > interested in following up now. I asked Robert if he planned to > re-propose it and he said I should go for it, so... here I go.
I removed a redundant (Size) cast, fixed the wait event name and category (WAIT_EVENT_BUFFILE_XXX is for buffile.c stuff, not bufmgr.c stuff, and this is really an IPC wait, not an IO wait despite the name), updated documentation and pgindented.
From cb4f3c943c47bb09864723c22cc0504c54dc9a3a Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Fri, 26 Feb 2021 18:01:28 +1300 Subject: [PATCH v2] Replace buffer I/O locks with condition variables. 1. Backends waiting for buffer I/O are now interruptible. 2. If something goes wrong in a backend that is currently performing I/O, other backends no longer wake up until that backend reaches AbortBufferIO() and broadcasts on the CV. Previously, any waiters would wake up (because the I/O lock was automatically released) and then busy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS. Author: Robert Haas <robertmh...@gmail.com> Reviewed-by: Tom Lane <t...@sss.pgh.pa.us> (2016) Reviewed-by: Thomas Munro <thomas.mu...@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGJ8nBFrjLuCTuqKN0pd2PQOwj9b_jnsiGFFMDvUxahj_A%40mail.gmail.com Discussion: https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=c56...@mail.gmail.com --- doc/src/sgml/monitoring.sgml | 8 +-- src/backend/postmaster/pgstat.c | 3 ++ src/backend/storage/buffer/buf_init.c | 19 ++++--- src/backend/storage/buffer/bufmgr.c | 64 +++++++----------------- src/backend/storage/lmgr/lwlock.c | 2 - src/include/pgstat.h | 1 + src/include/storage/buf_internals.h | 7 +-- src/include/storage/condition_variable.h | 11 ++++ src/include/storage/lwlock.h | 1 - src/tools/pgindent/typedefs.list | 1 + 10 files changed, 51 insertions(+), 66 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3513e127b7..b37d6484a4 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1581,6 +1581,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser <entry>Waiting for the page number needed to continue a parallel B-tree scan to become available.</entry> </row> + <row> + <entry><literal>BufferWaitIO</literal></entry> + <entry>Waiting for buffer I/O to complete.</entry> + </row> <row> <entry><literal>CheckpointDone</literal></entry> <entry>Waiting for a checkpoint to complete.</entry> @@ -1871,10 +1875,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser <entry><literal>BufferContent</literal></entry> <entry>Waiting to access a data page in memory.</entry> </row> - <row> - <entry><literal>BufferIO</literal></entry> - <entry>Waiting for I/O on a data page.</entry> - </row> <row> <entry><literal>BufferMapping</literal></entry> <entry>Waiting to associate a data block with a buffer in the buffer diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index f75b52719d..26cd2ce196 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4011,6 +4011,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_BTREE_PAGE: event_name = "BtreePage"; break; + case WAIT_EVENT_BUFFER_WAIT_IO: + event_name = "BufferWaitIO"; + break; case WAIT_EVENT_CHECKPOINT_DONE: event_name = "CheckpointDone"; break; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index e9e4f35bb5..51b250fe16 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -19,7 +19,7 @@ BufferDescPadded *BufferDescriptors; char *BufferBlocks; -LWLockMinimallyPadded *BufferIOLWLockArray = NULL; +ConditionVariableMinimallyPadded *BufferIOCVArray = NULL; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; @@ -68,7 +68,7 @@ InitBufferPool(void) { bool foundBufs, foundDescs, - foundIOLocks, + foundIOCV, foundBufCkpt; /* Align descriptors to a cacheline boundary. */ @@ -82,10 +82,10 @@ InitBufferPool(void) NBuffers * (Size) BLCKSZ, &foundBufs); /* Align lwlocks to cacheline boundary */ - BufferIOLWLockArray = (LWLockMinimallyPadded *) - ShmemInitStruct("Buffer IO Locks", - NBuffers * (Size) sizeof(LWLockMinimallyPadded), - &foundIOLocks); + BufferIOCVArray = (ConditionVariableMinimallyPadded *) + ShmemInitStruct("Buffer IO Condition Variables", + NBuffers * sizeof(ConditionVariableMinimallyPadded), + &foundIOCV); /* * The array used to sort to-be-checkpointed buffer ids is located in @@ -98,10 +98,10 @@ InitBufferPool(void) ShmemInitStruct("Checkpoint BufferIds", NBuffers * sizeof(CkptSortItem), &foundBufCkpt); - if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt) + if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) { /* should find all of these, or none of them */ - Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt); + Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); /* note: this path is only taken in EXEC_BACKEND case */ } else @@ -131,8 +131,7 @@ InitBufferPool(void) LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); - LWLockInitialize(BufferDescriptorGetIOLock(buf), - LWTRANCHE_BUFFER_IO); + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } /* Correct last entry of linked list */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 561c212092..5ba6891c64 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, LWLockRelease(newPartitionLock); /* - * Buffer contents are currently invalid. Try to get the io_in_progress - * lock. If StartBufferIO returns false, then someone else managed to - * read it before we did, so there's nothing left for BufferAlloc() to do. + * Buffer contents are currently invalid. Try to obtain the right to + * start I/O. If StartBufferIO returns false, then someone else managed + * to read it before we did, so there's nothing left for BufferAlloc() to + * do. */ if (StartBufferIO(buf, true)) *foundPtr = false; @@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner) */ VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); - /* I'd better not still hold any locks on the buffer */ + /* I'd better not still hold the buffer content lock */ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); - Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf))); /* * Decrement the shared reference count. @@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) uint32 buf_state; /* - * Acquire the buffer's io_in_progress lock. If StartBufferIO returns - * false, then someone else flushed the buffer before we could, so we need - * not do anything. + * Try to start an I/O operation. If StartBufferIO returns false, then + * someone else flushed the buffer before we could, so we need not do + * anything. */ if (!StartBufferIO(buf, false)) return; @@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* * Now it's safe to write buffer to disk. Note that no one else should * have been able to write it while we were busy with log flushing because - * we have the io_in_progress lock. + * only one process at a time can set the BM_IO_IN_PROGRESS bit. */ bufBlock = BufHdrGetBlock(buf); @@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and - * end the io_in_progress state. + * end the BM_IO_IN_PROGRESS state. */ TerminateBufferIO(buf, true, 0); @@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer) * Functions for buffer I/O handling * * Note: We assume that nested buffer I/O never occurs. - * i.e at most one io_in_progress lock is held per proc. + * i.e at most one BM_IO_IN_PROGRESS bit is set per proc. * * Also note that these are used only for shared buffers, not local ones. */ @@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer) static void WaitIO(BufferDesc *buf) { - /* - * Changed to wait until there's no IO - Inoue 01/13/2000 - * - * Note this is *necessary* because an error abort in the process doing - * I/O could release the io_in_progress_lock prematurely. See - * AbortBufferIO. - */ + ConditionVariable *cv = BufferDescriptorGetIOCV(buf); + + ConditionVariablePrepareToSleep(cv); for (;;) { uint32 buf_state; @@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf) if (!(buf_state & BM_IO_IN_PROGRESS)) break; - LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED); - LWLockRelease(BufferDescriptorGetIOLock(buf)); + ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_WAIT_IO); } + ConditionVariableCancelSleep(); } /* @@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf) * In some scenarios there are race conditions in which multiple backends * could attempt the same I/O operation concurrently. If someone else * has already started I/O on this buffer then we will block on the - * io_in_progress lock until he's done. + * I/O condition variable until he's done. * * Input operations are only attempted on buffers that are not BM_VALID, * and output operations only on buffers that are BM_VALID and BM_DIRTY, @@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput) for (;;) { - /* - * Grab the io_in_progress lock so that other processes can wait for - * me to finish the I/O. - */ - LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE); - buf_state = LockBufHdr(buf); if (!(buf_state & BM_IO_IN_PROGRESS)) break; - - /* - * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress - * lock isn't held is if the process doing the I/O is recovering from - * an error (see AbortBufferIO). If that's the case, we must wait for - * him to get unwedged. - */ UnlockBufHdr(buf, buf_state); - LWLockRelease(BufferDescriptorGetIOLock(buf)); WaitIO(buf); } @@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput) { /* someone else already did the I/O */ UnlockBufHdr(buf, buf_state); - LWLockRelease(BufferDescriptorGetIOLock(buf)); return false; } @@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput) * (Assumptions) * My process is executing IO for the buffer * BM_IO_IN_PROGRESS bit is set for the buffer - * We hold the buffer's io_in_progress lock * The buffer is Pinned * * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the @@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) InProgressBuf = NULL; - LWLockRelease(BufferDescriptorGetIOLock(buf)); + ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf)); } /* @@ -4434,14 +4414,6 @@ AbortBufferIO(void) { uint32 buf_state; - /* - * Since LWLockReleaseAll has already been called, we're not holding - * the buffer's io_in_progress_lock. We have to re-acquire it so that - * we can use TerminateBufferIO. Anyone who's executing WaitIO on the - * buffer will be in a busy spin until we succeed in doing this. - */ - LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE); - buf_state = LockBufHdr(buf); Assert(buf_state & BM_IO_IN_PROGRESS); if (IsForInput) diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 8cb6a6f042..60600d7a0b 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = { "WALInsert", /* LWTRANCHE_BUFFER_CONTENT: */ "BufferContent", - /* LWTRANCHE_BUFFER_IO: */ - "BufferIO", /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */ "ReplicationOriginState", /* LWTRANCHE_REPLICATION_SLOT_IO: */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 724068cf87..e8efbab3ae 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -961,6 +961,7 @@ typedef enum WAIT_EVENT_BGWORKER_SHUTDOWN, WAIT_EVENT_BGWORKER_STARTUP, WAIT_EVENT_BTREE_PAGE, + WAIT_EVENT_BUFFER_WAIT_IO, WAIT_EVENT_CHECKPOINT_DONE, WAIT_EVENT_CHECKPOINT_START, WAIT_EVENT_EXECUTE_GATHER, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index f6b5782965..532711200a 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -18,6 +18,7 @@ #include "port/atomics.h" #include "storage/buf.h" #include "storage/bufmgr.h" +#include "storage/condition_variable.h" #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/shmem.h" @@ -221,12 +222,12 @@ typedef union BufferDescPadded #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) -#define BufferDescriptorGetIOLock(bdesc) \ - (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock) +#define BufferDescriptorGetIOCV(bdesc) \ + (&(BufferIOCVArray[(bdesc)->buf_id]).cv) #define BufferDescriptorGetContentLock(bdesc) \ ((LWLock*) (&(bdesc)->content_lock)) -extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray; +extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; /* * The freeNext field is either the index of the next freelist entry, diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h index 0b7578f8c4..6310ca230a 100644 --- a/src/include/storage/condition_variable.h +++ b/src/include/storage/condition_variable.h @@ -31,6 +31,17 @@ typedef struct proclist_head wakeup; /* list of wake-able processes */ } ConditionVariable; +/* + * Pad a condition variable to a power-of-two size so that an array of + * condition variables does not cross a cache line boundary. + */ +#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32) +typedef union ConditionVariableMinimallyPadded +{ + ConditionVariable cv; + char pad[CV_MINIMAL_SIZE]; +} ConditionVariableMinimallyPadded; + /* Initialize a condition variable. */ extern void ConditionVariableInit(ConditionVariable *cv); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index cbf2510fbf..0ed190f73b 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -202,7 +202,6 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SERIAL_BUFFER, LWTRANCHE_WAL_INSERT, LWTRANCHE_BUFFER_CONTENT, - LWTRANCHE_BUFFER_IO, LWTRANCHE_REPLICATION_ORIGIN_STATE, LWTRANCHE_REPLICATION_SLOT_IO, LWTRANCHE_LOCK_FASTPATH, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8bd95aefa1..9c3b978c54 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -398,6 +398,7 @@ CompressionAlgorithm CompressorState ComputeXidHorizonsResult ConditionVariable +ConditionVariableMinimallyPadded ConditionalStack ConfigData ConfigVariable -- 2.30.0