On Tue, Mar 25, 2025 at 11:58 AM Andres Freund <and...@anarazel.de> wrote:
>
> > Another thought on complete_shared running on other backends: I wonder if we
> > should push an ErrorContextCallback that adds "CONTEXT: completing I/O of
> > other process" or similar, so people wonder less about how "SELECT FROM a" 
> > led
> > to a log message about IO on table "b".
>
> I've been wondering about that as well, and yes, we probably should.
>
> I'd add the pid of the backend that started the IO to the message - although
> need to check whether we're trying to keep PIDs of other processes from
> unprivileged users.
>
> I think we probably should add a similar, but not equivalent, context in io
> workers. Maybe "I/O worker executing I/O on behalf of process %d".

I think this has not yet been done. The attached patch is an attempt to
add error context both for I/O completions done by another backend when
using io_uring and for I/O processing in general by an I/O worker. It
seems to work -- that is, when running the test_aio tests, you can see
the context in the logs.
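
With io_method=worker, that shows up as a line like the following,
appended under whatever message the I/O processing reports (the pid
here is invented for illustration):

	CONTEXT:  I/O worker executing I/O on behalf of process 12345

and with io_method=io_uring, when one backend drains another backend's
completions:

	CONTEXT:  completing I/O on behalf of process 12345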

I'm not certain that I did this in the way you both were envisioning, though.

- Melanie
From 79d7e930de510cad6aef31532b06ba679a72d94a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Sun, 30 Mar 2025 14:03:55 -0400
Subject: [PATCH] Add errcontext for processing I/Os for another backend

Push an ErrorContextCallback adding additional detail about the process
performing the I/O and the owner of the I/O when those are not the same.

For io_method=worker, this adds context specifying which process owns
the I/O that the I/O worker is processing.

For io_method=io_uring, this adds context only when a backend is
*completing* I/O for another backend. It specifies the pid of the owning
process.

Discussion: https://postgr.es/m/20250325141120.8e.nmisch%40google.com
---
 src/backend/storage/aio/aio.c             |  1 +
 src/backend/storage/aio/method_io_uring.c | 31 +++++++++++++++++++++++
 src/backend/storage/aio/method_worker.c   | 29 +++++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 43f1e2a7785..3d5cf726e24 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -42,6 +42,7 @@
 #include "port/atomics.h"
 #include "storage/aio_internal.h"
 #include "storage/aio_subsys.h"
+#include "storage/proc.h"
 #include "storage/procnumber.h"
 #include "utils/guc.h"
 #include "utils/guc_hooks.h"
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index 3b299dcf388..244918e1883 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -303,14 +303,41 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
 	return num_staged_ios;
 }
 
+static void
+pgaio_uring_completion_error_callback(void *arg)
+{
+	ProcNumber	owner;
+	PGPROC	   *owner_proc;
+	int32		owner_pid;
+	PgAioHandle *ioh = arg;
+
+	if (!ioh)
+		return;
+
+	/* No need for context if a backend is completing the IO for itself */
+	if (ioh->owner_procno == MyProcNumber)
+		return;
+
+	owner = ioh->owner_procno;
+	owner_proc = GetPGProcByNumber(owner);
+	owner_pid = owner_proc->pid;
+
+	errcontext("completing I/O on behalf of process %d", owner_pid);
+}
+
 static void
 pgaio_uring_drain_locked(PgAioUringContext *context)
 {
 	int			ready;
 	int			orig_ready;
+	ErrorContextCallback errcallback = {0};
 
 	Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE));
 
+	errcallback.callback = pgaio_uring_completion_error_callback;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
 	/*
 	 * Don't drain more events than available right now. Otherwise it's
 	 * plausible that one backend could get stuck, for a while, receiving CQEs
@@ -338,9 +365,11 @@ pgaio_uring_drain_locked(PgAioUringContext *context)
 			PgAioHandle *ioh;
 
 			ioh = io_uring_cqe_get_data(cqe);
+			errcallback.arg = ioh;
 			io_uring_cqe_seen(&context->io_uring_ring, cqe);
 
 			pgaio_io_process_completion(ioh, cqe->res);
+			errcallback.arg = NULL;
 		}
 
 		END_CRIT_SECTION();
@@ -349,6 +378,8 @@ pgaio_uring_drain_locked(PgAioUringContext *context)
 					"drained %d/%d, now expecting %d",
 					ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring));
 	}
+
+	error_context_stack = errcallback.previous;
 }
 
 static void
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 5ea00d8a89e..52f901ed4c2 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -361,11 +361,33 @@ pgaio_worker_register(void)
 	on_shmem_exit(pgaio_worker_die, 0);
 }
 
+static void
+pgaio_worker_error_callback(void *arg)
+{
+	ProcNumber	owner;
+	PGPROC	   *owner_proc;
+	int32		owner_pid;
+	PgAioHandle *ioh = arg;
+
+	if (!ioh)
+		return;
+
+	Assert(ioh->owner_procno != MyProcNumber);
+	Assert(MyBackendType == B_IO_WORKER);
+
+	owner = ioh->owner_procno;
+	owner_proc = GetPGProcByNumber(owner);
+	owner_pid = owner_proc->pid;
+
+	errcontext("I/O worker executing I/O on behalf of process %d", owner_pid);
+}
+
 void
 IoWorkerMain(const void *startup_data, size_t startup_data_len)
 {
 	sigjmp_buf	local_sigjmp_buf;
 	PgAioHandle *volatile error_ioh = NULL;
+	ErrorContextCallback errcallback = {0};
 	volatile int error_errno = 0;
 	char		cmd[128];
 
@@ -392,6 +414,10 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 	sprintf(cmd, "%d", MyIoWorkerId);
 	set_ps_display(cmd);
 
+	errcallback.callback = pgaio_worker_error_callback;
+	errcallback.previous = error_context_stack;
+	error_context_stack = &errcallback;
+
 	/* see PostgresMain() */
 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
 	{
@@ -475,6 +501,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 
 			ioh = &pgaio_ctl->io_handles[io_index];
 			error_ioh = ioh;
+			errcallback.arg = ioh;
 
 			pgaio_debug_io(DEBUG4, ioh,
 						   "worker %d processing IO",
@@ -515,6 +542,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 			pgaio_io_perform_synchronously(ioh);
 
 			RESUME_INTERRUPTS();
+			errcallback.arg = NULL;
 		}
 		else
 		{
@@ -526,6 +554,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 		CHECK_FOR_INTERRUPTS();
 	}
 
+	error_context_stack = errcallback.previous;
 	proc_exit(0);
 }
 
-- 
2.34.1
