Hi,

In a number of workloads one can see two wait events prominently:
LWLock:WALWrite and LWLock:WALInsert. Unfortunately, neither is very
informative on its own:
LWLock:WALWrite can be reported because there is genuine contention on the
LWLock, or, more commonly, because several backends are waiting for another to
finish IO. In the latter case we are not actually waiting to acquire the lock,
we are waiting for the lock to be released *without* then acquiring it.

LWLock:WALInsert can be reported because there are not enough WALInsert locks
(cf. NUM_XLOGINSERT_LOCKS), or because we are waiting for another backend to
finish copying a WAL record into wal_buffers. In the latter case we are
therefore not waiting to acquire an LWLock.

I think both of these cases are relevant to distinguish from an operational
perspective. Secondarily, I've received many questions about making those
locks more scalable / granular, when in most of the cases the issue was not
actual lock contention. Today it's surprisingly hard to figure out whether the
issue is lock contention, or the speed of copying buffers for WAL insert locks
/ computing the last part of the CRC checksum.

Therefore I'm proposing that LWLockAcquireOrWait() and LWLockWaitForVar() not
use the "generic" LWLockReportWaitStart(), but use caller-provided wait
events. The attached patch adds two new wait events for the existing callers.

I waffled a bit about which wait event section to add these to. I ended up
with "IPC", but would be happy to change that.

WAIT_EVENT_WAL_WAIT_INSERT	WALWaitInsert	"Waiting for WAL record to be copied into buffers."
WAIT_EVENT_WAL_WAIT_WRITE	WALWaitWrite	"Waiting for WAL buffers to be written or flushed to disk."

Previously it was e.g. not really possible to see that something like this:

┌────────────────┬─────────────────┬────────────┬───────┐
│  backend_type  │ wait_event_type │ wait_event │ count │
├────────────────┼─────────────────┼────────────┼───────┤
│ client backend │ LWLock          │ WALInsert  │    32 │
│ client backend │ (null)          │ (null)     │     9 │
│ walwriter      │ IO              │ WALWrite   │     1 │
│ client backend │ Client          │ ClientRead │     1 │
│ client backend │ LWLock          │ WALWrite   │     1 │
└────────────────┴─────────────────┴────────────┴───────┘

is a workload with a very different bottleneck than this:

┌────────────────┬─────────────────┬───────────────┬───────┐
│  backend_type  │ wait_event_type │  wait_event   │ count │
├────────────────┼─────────────────┼───────────────┼───────┤
│ client backend │ IPC             │ WALWaitInsert │    22 │
│ client backend │ LWLock          │ WALInsert     │    13 │
│ client backend │ LWLock          │ WALBufMapping │     5 │
│ walwriter      │ (null)          │ (null)        │     1 │
│ client backend │ Client          │ ClientRead    │     1 │
│ client backend │ (null)          │ (null)        │     1 │
└────────────────┴─────────────────┴───────────────┴───────┘

FWIW, the former is bottlenecked by the number of WAL insertion locks, the
latter by copying WAL into buffers due to needing to flush them.

Greetings,

Andres Freund
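PS: The breakdowns above are the kind of output you get from aggregating
pg_stat_activity; a sketch of such a query (not necessarily the exact one used
for the numbers above) is:

SELECT backend_type, wait_event_type, wait_event, count(*)
FROM pg_stat_activity
GROUP BY backend_type, wait_event_type, wait_event
ORDER BY count(*) DESC;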
>From 9dcf4a45b6ed7d8fca1a1cd6782f11dff3a84406 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 17 Jul 2023 09:20:05 -0700
Subject: [PATCH v1] Caller specified wait events for LWLockWaitForVar(),
 LWLockAcquireOrWait()

Reporting waits within those functions as LWLock wait events is misleading, as
we do not actually wait for the locks themselves.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/lwlock.h                 |  6 ++++--
 src/backend/access/transam/xlog.c            |  6 ++++--
 src/backend/storage/lmgr/lwlock.c            | 19 +++++++++++++------
 .../utils/activity/wait_event_names.txt      |  2 ++
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 34169e5889e..0f345076f41 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -127,7 +127,8 @@ extern PGDLLIMPORT bool Trace_lwlocks;
 
 extern bool LWLockAcquire(LWLock *lock, LWLockMode mode);
 extern bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode);
-extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode);
+extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode,
+                                uint32 wait_event_info);
 extern void LWLockRelease(LWLock *lock);
 extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
@@ -135,7 +136,8 @@ extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
 
-extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
+extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval,
+                             uint32 wait_event_info);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val);
 
 extern Size LWLockShmemSize(void);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8b0710abe60..e406634c1cf 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1498,7 +1498,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
 		 */
 		if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
 							 &WALInsertLocks[i].l.insertingAt,
-							 insertingat, &insertingat))
+							 insertingat, &insertingat,
+							 WAIT_EVENT_WAL_WAIT_INSERT))
 		{
 			/* the lock was free, so no insertion in progress */
 			insertingat = InvalidXLogRecPtr;
@@ -2586,7 +2587,8 @@ XLogFlush(XLogRecPtr record)
 		 * helps to maintain a good rate of group committing when the system
 		 * is bottlenecked by the speed of fsyncing.
 		 */
-		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE,
+								 WAIT_EVENT_WAL_WAIT_WRITE))
 		{
 			/*
 			 * The lock is now free, but we didn't acquire it yet. Before we
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 01d738f306b..549b061b533 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1418,9 +1418,12 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
  * backends as a side-effect. Those other backends need to wait until the
  * flush finishes, but don't need to acquire the lock anymore. They can just
  * wake up, observe that their records have already been flushed, and return.
+ *
+ * As this is not used to *wait* for lwlock acquisition, the caller has to
+ * provide a wait event to be reported.
  */
 bool
-LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode, uint32 wait_event_info)
 {
 	PGPROC	   *proc = MyProc;
 	bool		mustwait;
@@ -1470,7 +1473,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 		lwstats->block_count++;
 #endif
 
-		LWLockReportWaitStart(lock);
+		pgstat_report_wait_start(wait_event_info);
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
@@ -1492,7 +1495,7 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 #endif
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
-		LWLockReportWaitEnd();
+		pgstat_report_wait_end();
 
 		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
 	}
@@ -1605,9 +1608,13 @@ LWLockConflictsWithVar(LWLock *lock,
  *
  * Note: this function ignores shared lock holders; if the lock is held
  * in shared mode, returns 'true'.
+ *
+ * As this is not used to wait for lwlocks themselves, the caller has to
+ * provide a wait event to be reported.
  */
 bool
-LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval,
+				 uint32 wait_event_info)
 {
 	PGPROC	   *proc = MyProc;
 	int			extraWaits = 0;
@@ -1686,7 +1693,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 		lwstats->block_count++;
 #endif
 
-		LWLockReportWaitStart(lock);
+		pgstat_report_wait_start(wait_event_info);
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
 
@@ -1709,7 +1716,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 
 		if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
 			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
-		LWLockReportWaitEnd();
+		pgstat_report_wait_end();
 
 		LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
 
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 3fabad96d96..a9358538721 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -141,6 +141,8 @@ WAIT_EVENT_SYNC_REP SyncRep "Waiting for confirmation from a remote server durin
 WAIT_EVENT_WAL_RECEIVER_EXIT	WalReceiverExit	"Waiting for the WAL receiver to exit."
 WAIT_EVENT_WAL_RECEIVER_WAIT_START	WalReceiverWaitStart	"Waiting for startup process to send initial data for streaming replication."
 WAIT_EVENT_XACT_GROUP_UPDATE	XactGroupUpdate	"Waiting for the group leader to update transaction status at end of a parallel operation."
+WAIT_EVENT_WAL_WAIT_INSERT	WALWaitInsert	"Waiting for WAL record to be copied into buffers."
+WAIT_EVENT_WAL_WAIT_WRITE	WALWaitWrite	"Waiting for WAL buffers to be written or flushed to disk."
 
 
 #
-- 
2.38.0
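With the patch applied, whether a workload is limited by actual WAL lock
contention or by waiting for other backends to copy/flush WAL can be checked
directly from the wait events. A rough one-off sample could look like this
(illustration only; the event names are the ones added by the patch):

SELECT
    count(*) FILTER (WHERE wait_event_type = 'IPC'
                     AND wait_event IN ('WALWaitInsert', 'WALWaitWrite')) AS waiting_on_wal_copy_or_flush,
    count(*) FILTER (WHERE wait_event_type = 'LWLock'
                     AND wait_event IN ('WALInsert', 'WALWrite')) AS waiting_on_wal_locks
FROM pg_stat_activity;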