Hi,

In a number of workloads one can see two wait events prominently:
LWLock:WALWrite and LWLock:WALInsert. Unfortunately, neither is very
informative on its own:
LWLock:WALWrite can be reported because there is genuine contention on the
LWLock, or, more commonly, because several backends are waiting for another to
finish IO. In the latter case we are not actually waiting to acquire the lock,
we are waiting for the lock to be released *without* then acquiring it.

LWLock:WALInsert can be reported because there are not enough WALInsert locks
(cf. NUM_XLOGINSERT_LOCKS), or because we are waiting for another backend to
finish copying a WAL record into wal_buffers. In the latter case we are
therefore not waiting to acquire an LWLock.

I think both of these cases are relevant to distinguish from an operational
perspective. Secondarily, I've received many questions about making those
locks more scalable / granular, when in most of the cases the issue was not
actual lock contention. Today it's surprisingly hard to figure out whether the
issue is lock contention, or the speed of copying buffers for WAL insert locks
/ computing the last part of the CRC checksum.

Therefore I'm proposing that LWLockAcquireOrWait() and LWLockWaitForVar() not
use the "generic" LWLockReportWaitStart(), but use caller-provided wait
events. The attached patch adds two new wait events for the existing callers.

I waffled a bit about which wait event section to add these to. I ended up
with "IPC", but would be happy to change that.

WAIT_EVENT_WAL_WAIT_INSERT	WALWaitInsert	"Waiting for WAL record to be copied into buffers."
WAIT_EVENT_WAL_WAIT_WRITE	WALWaitWrite	"Waiting for WAL buffers to be written or flushed to disk."

Previously it was e.g. not really possible to see that something like this:

┌────────────────┬─────────────────┬────────────┬───────┐
│  backend_type  │ wait_event_type │ wait_event │ count │
├────────────────┼─────────────────┼────────────┼───────┤
│ client backend │ LWLock          │ WALInsert  │    32 │
│ client backend │ (null)          │ (null)     │     9 │
│ walwriter      │ IO              │ WALWrite   │     1 │
│ client backend │ Client          │ ClientRead │     1 │
│ client backend │ LWLock          │ WALWrite   │     1 │
└────────────────┴─────────────────┴────────────┴───────┘

is a workload with a very different bottleneck than this:

┌────────────────┬─────────────────┬───────────────┬───────┐
│  backend_type  │ wait_event_type │  wait_event   │ count │
├────────────────┼─────────────────┼───────────────┼───────┤
│ client backend │ IPC             │ WALWaitInsert │    22 │
│ client backend │ LWLock          │ WALInsert     │    13 │
│ client backend │ LWLock          │ WALBufMapping │     5 │
│ walwriter      │ (null)          │ (null)        │     1 │
│ client backend │ Client          │ ClientRead    │     1 │
│ client backend │ (null)          │ (null)        │     1 │
└────────────────┴─────────────────┴───────────────┴───────┘

FWIW, the former is bottlenecked by the number of WAL insertion locks, the
latter by copying WAL into buffers due to needing to flush them.

Greetings,

Andres Freund
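PS: The breakdowns above are the kind of output you get from aggregating
pg_stat_activity; a sketch of such a query (not necessarily the exact one used
for the numbers above) is:

SELECT backend_type, wait_event_type, wait_event, count(*)
FROM pg_stat_activity
GROUP BY backend_type, wait_event_type, wait_event
ORDER BY count(*) DESC;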
>From 9dcf4a45b6ed7d8fca1a1cd6782f11dff3a84406 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 17 Jul 2023 09:20:05 -0700
Subject: [PATCH v1] Caller specified wait events for LWLockWaitForVar(),
 LWLockAcquireOrWait()

Reporting waits within those functions as LWLock wait events is misleading, as
we do not actually wait for the locks themselves.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/lwlock.h                 |  6 ++++--
 src/backend/access/transam/xlog.c            |  6 ++++--
 src/backend/storage/lmgr/lwlock.c            | 19 +++++++++++++------
 .../utils/activity/wait_event_names.txt      |  2 ++
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 34169e5889e..0f345076f41 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -127,7 +127,8 @@ extern PGDLLIMPORT bool Trace_lwlocks;
 
 extern bool LWLockAcquire(LWLock *lock, LWLockMode mode);
 extern bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode);
-extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode);
+extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode,
+                                uint32 wait_event_info);
 extern void LWLockRelease(LWLock *lock);
 extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
@@ -135,7 +136,8 @@ extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
 
-extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
+extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval,
+                             uint32 wait_event_info);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val);
 
 extern Size LWLockShmemSize(void);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8b0710abe60..e406634c1cf 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1498,7 +1498,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
 		 */
 		if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
 							 &WALInsertLocks[i].l.insertingAt,
-							 insertingat, &insertingat))
+							 insertingat, &insertingat,
+							 WAIT_EVENT_WAL_WAIT_INSERT))
 		{
 			/* the lock was free, so no insertion in progress */
 			insertingat = InvalidXLogRecPtr;
@@ -2586,7 +2587,8 @@ XLogFlush(XLogRecPtr record)
 		 * helps to maintain a good rate of group committing when the system
 		 * is bottlenecked by the speed of fsyncing.
 		 */
-		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE,
+								 WAIT_EVENT_WAL_WAIT_WRITE))
 		{
 			/*
 			 * The lock is now free, but we didn't acquire it yet. Before we
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 01d738f306b..549b061b533 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1418,9 +1418,12 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
  * backends as a side-effect. Those other backends need to wait until the
  * flush finishes, but don't need to acquire the lock anymore. They can just
  * wake up, observe that their records have already been flushed, and return.
+ *
+ * As this is not used to *wait* for lwlock acquisition, the caller has to
+ * provide a wait event to be reported.
  */
 bool
-LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode, uint32 wait_event_info)
 {
 	PGPROC	   *proc = MyProc;
 	bool		mustwait;
@@ -1470,7 +1473,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 		lwstats->block_count++;
 #endif
 
-		LWLockReportWaitStart(lock);
+		pgstat_report_wait_start(wait_event_info);
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
@@ -1492,7 +1495,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 #endif
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
-		LWLockReportWaitEnd();
+		pgstat_report_wait_end();
 
 		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
 	}
@@ -1605,9 +1608,13 @@ LWLockConflictsWithVar(LWLock *lock,
  *
  * Note: this function ignores shared lock holders; if the lock is held
  * in shared mode, returns 'true'.
+ *
+ * As this is not used to wait for lwlocks themselves, the caller has to
+ * provide a wait event to be reported.
  */
 bool
-LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval,
+				 uint32 wait_event_info)
 {
 	PGPROC	   *proc = MyProc;
 	int			extraWaits = 0;
@@ -1686,7 +1693,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 		lwstats->block_count++;
 #endif
 
-		LWLockReportWaitStart(lock);
+		pgstat_report_wait_start(wait_event_info);
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
 
@@ -1709,7 +1716,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
-		LWLockReportWaitEnd();
+		pgstat_report_wait_end();
 
 		LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
 
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 3fabad96d96..a9358538721 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -141,6 +141,8 @@ WAIT_EVENT_SYNC_REP SyncRep "Waiting for confirmation from a remote server durin
 WAIT_EVENT_WAL_RECEIVER_EXIT	WalReceiverExit	"Waiting for the WAL receiver to exit."
 WAIT_EVENT_WAL_RECEIVER_WAIT_START	WalReceiverWaitStart	"Waiting for startup process to send initial data for streaming replication."
 WAIT_EVENT_XACT_GROUP_UPDATE	XactGroupUpdate	"Waiting for the group leader to update transaction status at end of a parallel operation."
+WAIT_EVENT_WAL_WAIT_INSERT	WALWaitInsert	"Waiting for WAL record to be copied into buffers."
+WAIT_EVENT_WAL_WAIT_WRITE	WALWaitWrite	"Waiting for WAL buffers to be written or flushed to disk."
 
 
 #
-- 
2.38.0
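With the patch applied, whether a workload is limited by actual WAL lock
contention or by waiting for other backends to copy/flush WAL can be checked
directly from the wait events. A rough one-off sample could look like this
(illustration only; the event names are the ones added by the patch):

SELECT
    count(*) FILTER (WHERE wait_event_type = 'IPC'
                     AND wait_event IN ('WALWaitInsert', 'WALWaitWrite')) AS waiting_on_wal_copy_or_flush,
    count(*) FILTER (WHERE wait_event_type = 'LWLock'
                     AND wait_event IN ('WALInsert', 'WALWrite')) AS waiting_on_wal_locks
FROM pg_stat_activity;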