On Fri, Jun 24, 2022 at 11:45:22AM +0100, Simon Riggs wrote:
> On Thu, 23 Jun 2022 at 18:15, Nathan Bossart <nathandboss...@gmail.com> wrote:
>> I'm grateful for the discussion in this thread so far, but I'm not seeing a
>> clear path forward.
> 
> +1 to add the new auxiliary process.

I went ahead and put together a new patch set for this in which I've
attempted to address most of the feedback from upthread.  Notably, I've
abandoned 0007 and 0008, added a way for processes to request specific
tasks for the custodian, and removed all the checks for
ShutdownRequestPending.

I haven't addressed the existing transaction ID wraparound risk with the
logical replication files.  My instinct is that this deserveѕ its own
thread, and it might need to be considered a prerequisite to this change
based on the prior discussion here.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 68e3005c14ba116e372a1724dad079914108ab2d Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Wed, 5 Jan 2022 19:24:22 +0000
Subject: [PATCH v6 1/6] Introduce custodian.

The custodian process is a new auxiliary process that is intended
to help offload tasks could otherwise delay startup and
checkpointing.  This commit simply adds the new process; it does
not yet do anything useful.
---
 src/backend/postmaster/Makefile         |   1 +
 src/backend/postmaster/auxprocess.c     |   8 +
 src/backend/postmaster/custodian.c      | 252 ++++++++++++++++++++++++
 src/backend/postmaster/postmaster.c     |  44 ++++-
 src/backend/storage/ipc/ipci.c          |   3 +
 src/backend/storage/lmgr/proc.c         |   1 +
 src/backend/utils/activity/wait_event.c |   3 +
 src/backend/utils/init/miscinit.c       |   3 +
 src/include/miscadmin.h                 |   3 +
 src/include/postmaster/custodian.h      |  20 ++
 src/include/storage/proc.h              |  11 +-
 src/include/utils/wait_event.h          |   1 +
 12 files changed, 345 insertions(+), 5 deletions(-)
 create mode 100644 src/backend/postmaster/custodian.c
 create mode 100644 src/include/postmaster/custodian.h

diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3a794e54d6..e1e1d1123f 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -18,6 +18,7 @@ OBJS = \
 	bgworker.o \
 	bgwriter.o \
 	checkpointer.o \
+	custodian.o \
 	fork_process.o \
 	interrupt.o \
 	pgarch.o \
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index 39ac4490db..620a0b1bae 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -20,6 +20,7 @@
 #include "pgstat.h"
 #include "postmaster/auxprocess.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/custodian.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
@@ -74,6 +75,9 @@ AuxiliaryProcessMain(AuxProcType auxtype)
 		case CheckpointerProcess:
 			MyBackendType = B_CHECKPOINTER;
 			break;
+		case CustodianProcess:
+			MyBackendType = B_CUSTODIAN;
+			break;
 		case WalWriterProcess:
 			MyBackendType = B_WAL_WRITER;
 			break;
@@ -153,6 +157,10 @@ AuxiliaryProcessMain(AuxProcType auxtype)
 			CheckpointerMain();
 			proc_exit(1);
 
+		case CustodianProcess:
+			CustodianMain();
+			proc_exit(1);
+
 		case WalWriterProcess:
 			WalWriterMain();
 			proc_exit(1);
diff --git a/src/backend/postmaster/custodian.c b/src/backend/postmaster/custodian.c
new file mode 100644
index 0000000000..db00282658
--- /dev/null
+++ b/src/backend/postmaster/custodian.c
@@ -0,0 +1,252 @@
+/*-------------------------------------------------------------------------
+ *
+ * custodian.c
+ *
+ * The custodian process handles a variety of non-critical tasks that might
+ * otherwise delay startup, checkpointing, etc.  Offloaded tasks should not
+ * be synchronous (e.g., checkpointing shouldn't wait for the custodian to
+ * complete a task before proceeding).  Also, ensure that any offloaded
+ * tasks are either not required during single-user mode or are performed
+ * separately during single-user mode.
+ *
+ * The custodian is not an essential process and can shutdown quickly when
+ * requested.  The custodian will wake up approximately once every 5
+ * minutes to perform its tasks, but backends can (and should) set its
+ * latch to wake it up sooner.
+ *
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *   src/backend/postmaster/custodian.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <time.h>
+
+#include "libpq/pqsignal.h"
+#include "pgstat.h"
+#include "postmaster/custodian.h"
+#include "postmaster/interrupt.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/fd.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+
+#define CUSTODIAN_TIMEOUT_S (300)		/* 5 minutes */
+
+typedef struct
+{
+	slock_t		cust_lck;
+	int			cust_flags;
+} CustodianShmemStruct;
+
+static CustodianShmemStruct *CustodianShmem;
+
+/*
+ * Main entry point for custodian process
+ *
+ * This is invoked from AuxiliaryProcessMain, which has already created the
+ * basic execution environment, but not enabled signals yet.
+ */
+void
+CustodianMain(void)
+{
+	sigjmp_buf	local_sigjmp_buf;
+	MemoryContext custodian_context;
+
+	/*
+	 * Properly accept or ignore signals that might be sent to us.
+	 */
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGINT, SignalHandlerForShutdownRequest);
+	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
+	/* SIGQUIT handler was already set up by InitPostmasterChild */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+
+	/*
+	 * Create a memory context that we will do all our work in.  We do this so
+	 * that we can reset the context during error recovery and thereby avoid
+	 * possible memory leaks.
+	 */
+	custodian_context = AllocSetContextCreate(TopMemoryContext,
+											  "Custodian",
+											  ALLOCSET_DEFAULT_SIZES);
+	MemoryContextSwitchTo(custodian_context);
+
+	/*
+	 * If an exception is encountered, processing resumes here.  As with other
+	 * auxiliary processes, we cannot use PG_TRY because this is the bottom of
+	 * the exception stack.
+	 */
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/* Since not using PG_TRY, must reset error stack by hand */
+		error_context_stack = NULL;
+
+		/* Prevent interrupts while cleaning up */
+		HOLD_INTERRUPTS();
+
+		/* Report the error to the server log */
+		EmitErrorReport();
+
+		/*
+		 * These operations are really just a minimal subset of
+		 * AbortTransaction().  We don't have very many resources to worry
+		 * about.
+		 */
+		LWLockReleaseAll();
+		ConditionVariableCancelSleep();
+		AbortBufferIO();
+		UnlockBuffers();
+		ReleaseAuxProcessResources(false);
+		AtEOXact_Buffers(false);
+		AtEOXact_SMgr();
+		AtEOXact_Files(false);
+		AtEOXact_HashTables(false);
+
+		/*
+		 * Now return to normal top-level context and clear ErrorContext for
+		 * next time.
+		 */
+		MemoryContextSwitchTo(custodian_context);
+		FlushErrorState();
+
+		/* Flush any leaked data in the top-level context */
+		MemoryContextResetAndDeleteChildren(custodian_context);
+
+		/* Now we can allow interrupts again */
+		RESUME_INTERRUPTS();
+
+		/*
+		 * Sleep at least 1 second after any error.  A write error is likely
+		 * to be repeated, and we don't want to be filling the error logs as
+		 * fast as we can.
+		 */
+		pg_usleep(1000000L);
+
+		/*
+		 * Close all open files after any error.  This is helpful on Windows,
+		 * where holding deleted files open causes various strange errors.
+		 * It's not clear we need it elsewhere, but shouldn't hurt.
+		 */
+		smgrcloseall();
+
+		/* Report wait end here, when there is no further possibility of wait */
+		pgstat_report_wait_end();
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
+	 * Advertise out latch that backends can use to wake us up while we're
+	 * sleeping.
+	 */
+	ProcGlobal->custodianLatch = &MyProc->procLatch;
+
+	/*
+	 * On startup and after an exception, we won't know exactly what tasks need
+	 * to be performed, so request all of them.
+	 */
+	SpinLockAcquire(&CustodianShmem->cust_lck);
+	CustodianShmem->cust_flags = 0xFFFFFFFF;
+	SpinLockRelease(&CustodianShmem->cust_lck);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		pg_time_t	start_time;
+		pg_time_t	end_time;
+		int			elapsed_secs;
+		int			cur_timeout;
+		int			flags;
+
+		/* Clear any already-pending wakeups */
+		ResetLatch(MyLatch);
+
+		HandleMainLoopInterrupts();
+
+		start_time = (pg_time_t) time(NULL);
+
+		/* Obtain requested tasks */
+		SpinLockAcquire(&CustodianShmem->cust_lck);
+		flags = CustodianShmem->cust_flags;
+		CustodianShmem->cust_flags = 0;
+		SpinLockRelease(&CustodianShmem->cust_lck);
+
+		/* TODO: offloaded tasks go here */
+
+		/* Calculate how long to sleep */
+		end_time = (pg_time_t) time(NULL);
+		elapsed_secs = end_time - start_time;
+		if (elapsed_secs >= CUSTODIAN_TIMEOUT_S)
+			continue;			/* no sleep for us */
+		cur_timeout = CUSTODIAN_TIMEOUT_S - elapsed_secs;
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+						 cur_timeout * 1000L /* convert to ms */ ,
+						 WAIT_EVENT_CUSTODIAN_MAIN);
+	}
+
+	pg_unreachable();
+}
+
+Size
+CustodianShmemSize(void)
+{
+	return sizeof(CustodianShmemStruct);
+}
+
+void
+CustodianShmemInit(void)
+{
+	Size		size = CustodianShmemSize();
+	bool		found;
+
+	CustodianShmem = (CustodianShmemStruct *)
+		ShmemInitStruct("Custodian Data", size, &found);
+
+	if (!found)
+	{
+		memset(CustodianShmem, 0, size);
+		SpinLockInit(&CustodianShmem->cust_lck);
+	}
+}
+
+/*
+ * RequestCustodian
+ *		Called to request a custodian task.
+ */
+void
+RequestCustodian(int flags)
+{
+	SpinLockAcquire(&CustodianShmem->cust_lck);
+	CustodianShmem->cust_flags |= flags;
+	SpinLockRelease(&CustodianShmem->cust_lck);
+
+	if (ProcGlobal->custodianLatch)
+		SetLatch(ProcGlobal->custodianLatch);
+}
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index dde4bc25b1..5162ee9dec 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -251,6 +251,7 @@ bool		remove_temp_files_after_crash = true;
 static pid_t StartupPID = 0,
 			BgWriterPID = 0,
 			CheckpointerPID = 0,
+			CustodianPID = 0,
 			WalWriterPID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
@@ -548,6 +549,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartArchiver()			StartChildProcess(ArchiverProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
+#define StartCustodian()		StartChildProcess(CustodianProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
 
@@ -1823,13 +1825,16 @@ ServerLoop(void)
 		/*
 		 * If no background writer process is running, and we are not in a
 		 * state that prevents it, start one.  It doesn't matter if this
-		 * fails, we'll just try again later.  Likewise for the checkpointer.
+		 * fails, we'll just try again later.  Likewise for the checkpointer
+		 * and custodian.
 		 */
 		if (pmState == PM_RUN || pmState == PM_RECOVERY ||
 			pmState == PM_HOT_STANDBY || pmState == PM_STARTUP)
 		{
 			if (CheckpointerPID == 0)
 				CheckpointerPID = StartCheckpointer();
+			if (CustodianPID == 0)
+				CustodianPID = StartCustodian();
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
 		}
@@ -2769,6 +2774,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(BgWriterPID, SIGHUP);
 		if (CheckpointerPID != 0)
 			signal_child(CheckpointerPID, SIGHUP);
+		if (CustodianPID != 0)
+			signal_child(CustodianPID, SIGHUP);
 		if (WalWriterPID != 0)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
@@ -3089,6 +3096,8 @@ reaper(SIGNAL_ARGS)
 			 */
 			if (CheckpointerPID == 0)
 				CheckpointerPID = StartCheckpointer();
+			if (CustodianPID == 0)
+				CustodianPID = StartCustodian();
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
 			if (WalWriterPID == 0)
@@ -3182,6 +3191,20 @@ reaper(SIGNAL_ARGS)
 			continue;
 		}
 
+		/*
+		 * Was it the custodian?  Normal exit can be ignored; we'll start a
+		 * new one at the next iteration of the postmaster's main loop, if
+		 * necessary.  Any other exit condition is treated as a crash.
+		 */
+		if (pid == CustodianPID)
+		{
+			CustodianPID = 0;
+			if (!EXIT_STATUS_0(exitstatus))
+				HandleChildCrash(pid, exitstatus,
+								 _("custodian process"));
+			continue;
+		}
+
 		/*
 		 * Was it the wal writer?  Normal exit can be ignored; we'll start a
 		 * new one at the next iteration of the postmaster's main loop, if
@@ -3639,6 +3662,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the custodian too */
+	if (pid == CustodianPID)
+		CustodianPID = 0;
+	else if (CustodianPID != 0 && take_action)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) CustodianPID)));
+		signal_child(CustodianPID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the walwriter too */
 	if (pid == WalWriterPID)
 		WalWriterPID = 0;
@@ -3816,6 +3851,9 @@ PostmasterStateMachine(void)
 		/* and the bgwriter too */
 		if (BgWriterPID != 0)
 			signal_child(BgWriterPID, SIGTERM);
+		/* and the custodian too */
+		if (CustodianPID != 0)
+			signal_child(CustodianPID, SIGTERM);
 		/* and the walwriter too */
 		if (WalWriterPID != 0)
 			signal_child(WalWriterPID, SIGTERM);
@@ -3853,6 +3891,7 @@ PostmasterStateMachine(void)
 			BgWriterPID == 0 &&
 			(CheckpointerPID == 0 ||
 			 (!FatalError && Shutdown < ImmediateShutdown)) &&
+			CustodianPID == 0 &&
 			WalWriterPID == 0 &&
 			AutoVacPID == 0)
 		{
@@ -3942,6 +3981,7 @@ PostmasterStateMachine(void)
 			Assert(WalReceiverPID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(CheckpointerPID == 0);
+			Assert(CustodianPID == 0);
 			Assert(WalWriterPID == 0);
 			Assert(AutoVacPID == 0);
 			/* syslogger is not considered here */
@@ -4135,6 +4175,8 @@ TerminateChildren(int signal)
 		signal_child(BgWriterPID, signal);
 	if (CheckpointerPID != 0)
 		signal_child(CheckpointerPID, signal);
+	if (CustodianPID != 0)
+		signal_child(CustodianPID, signal);
 	if (WalWriterPID != 0)
 		signal_child(WalWriterPID, signal);
 	if (WalReceiverPID != 0)
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 1a6f527051..b19d743cab 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -30,6 +30,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/custodian.h"
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/origin.h"
@@ -129,6 +130,7 @@ CalculateShmemSize(int *num_semaphores)
 	size = add_size(size, PMSignalShmemSize());
 	size = add_size(size, ProcSignalShmemSize());
 	size = add_size(size, CheckpointerShmemSize());
+	size = add_size(size, CustodianShmemSize());
 	size = add_size(size, AutoVacuumShmemSize());
 	size = add_size(size, ReplicationSlotsShmemSize());
 	size = add_size(size, ReplicationOriginShmemSize());
@@ -277,6 +279,7 @@ CreateSharedMemoryAndSemaphores(void)
 	PMSignalShmemInit();
 	ProcSignalShmemInit();
 	CheckpointerShmemInit();
+	CustodianShmemInit();
 	AutoVacuumShmemInit();
 	ReplicationSlotsShmemInit();
 	ReplicationOriginShmemInit();
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 37aaab1338..f297f489c9 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -180,6 +180,7 @@ InitProcGlobal(void)
 	ProcGlobal->startupBufferPinWaitBufId = -1;
 	ProcGlobal->walwriterLatch = NULL;
 	ProcGlobal->checkpointerLatch = NULL;
+	ProcGlobal->custodianLatch = NULL;
 	pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO);
 	pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO);
 
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 87c15b9c6f..469768c4e4 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -224,6 +224,9 @@ pgstat_get_wait_activity(WaitEventActivity w)
 		case WAIT_EVENT_CHECKPOINTER_MAIN:
 			event_name = "CheckpointerMain";
 			break;
+		case WAIT_EVENT_CUSTODIAN_MAIN:
+			event_name = "CustodianMain";
+			break;
 		case WAIT_EVENT_LOGICAL_APPLY_MAIN:
 			event_name = "LogicalApplyMain";
 			break;
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index b25bd0e583..66bf42e5b1 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -273,6 +273,9 @@ GetBackendTypeDesc(BackendType backendType)
 		case B_CHECKPOINTER:
 			backendDesc = "checkpointer";
 			break;
+		case B_CUSTODIAN:
+			backendDesc = "custodian";
+			break;
 		case B_STARTUP:
 			backendDesc = "startup";
 			break;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 0af130fbc5..ffe9404c68 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -330,6 +330,7 @@ typedef enum BackendType
 	B_BG_WORKER,
 	B_BG_WRITER,
 	B_CHECKPOINTER,
+	B_CUSTODIAN,
 	B_STARTUP,
 	B_WAL_RECEIVER,
 	B_WAL_SENDER,
@@ -433,6 +434,7 @@ typedef enum
 	BgWriterProcess,
 	ArchiverProcess,
 	CheckpointerProcess,
+	CustodianProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
 
@@ -445,6 +447,7 @@ extern PGDLLIMPORT AuxProcType MyAuxProcType;
 #define AmBackgroundWriterProcess() (MyAuxProcType == BgWriterProcess)
 #define AmArchiverProcess()			(MyAuxProcType == ArchiverProcess)
 #define AmCheckpointerProcess()		(MyAuxProcType == CheckpointerProcess)
+#define AmCustodianProcess()		(MyAuxProcType == CustodianProcess)
 #define AmWalWriterProcess()		(MyAuxProcType == WalWriterProcess)
 #define AmWalReceiverProcess()		(MyAuxProcType == WalReceiverProcess)
 
diff --git a/src/include/postmaster/custodian.h b/src/include/postmaster/custodian.h
new file mode 100644
index 0000000000..c95a7c7de6
--- /dev/null
+++ b/src/include/postmaster/custodian.h
@@ -0,0 +1,20 @@
+/*-------------------------------------------------------------------------
+ *
+ * custodian.h
+ *   Exports from postmaster/custodian.c.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/custodian.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _CUSTODIAN_H
+#define _CUSTODIAN_H
+
+extern void CustodianMain(void) pg_attribute_noreturn();
+extern Size CustodianShmemSize(void);
+extern void CustodianShmemInit(void);
+extern void RequestCustodian(int flags);
+
+#endif						/* _CUSTODIAN_H */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 2579e619eb..467421e371 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -394,6 +394,8 @@ typedef struct PROC_HDR
 	Latch	   *walwriterLatch;
 	/* Checkpointer process's latch */
 	Latch	   *checkpointerLatch;
+	/* Custodian process's latch */
+	Latch	   *custodianLatch;
 	/* Current shared estimate of appropriate spins_per_delay value */
 	int			spins_per_delay;
 	/* Buffer id of the buffer that Startup process waits for pin on, or -1 */
@@ -411,11 +413,12 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs;
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
  *
- * Background writer, checkpointer, WAL writer and archiver run during normal
- * operation.  Startup process and WAL receiver also consume 2 slots, but WAL
- * writer is launched only after startup has exited, so we only need 5 slots.
+ * Background writer, checkpointer, custodian, WAL writer and archiver run
+ * during normal operation.  Startup process and WAL receiver also consume 2
+ * slots, but WAL writer is launched only after startup has exited, so we only
+ * need 6 slots.
  */
-#define NUM_AUXILIARY_PROCS		5
+#define NUM_AUXILIARY_PROCS		6
 
 /* configurable options */
 extern PGDLLIMPORT int DeadlockTimeout;
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index b578e2ec75..7524e197e5 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -40,6 +40,7 @@ typedef enum
 	WAIT_EVENT_BGWRITER_HIBERNATE,
 	WAIT_EVENT_BGWRITER_MAIN,
 	WAIT_EVENT_CHECKPOINTER_MAIN,
+	WAIT_EVENT_CUSTODIAN_MAIN,
 	WAIT_EVENT_LOGICAL_APPLY_MAIN,
 	WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
 	WAIT_EVENT_RECOVERY_WAL_STREAM,
-- 
2.25.1

>From 758003bef540e1174e381f6fd8cdb73dde13cab6 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Sun, 5 Dec 2021 19:38:20 -0800
Subject: [PATCH v6 2/6] Also remove pgsql_tmp directories during startup.

Presently, the server only removes the contents of the temporary
directories during startup, not the directory itself.  This changes
that to prepare for future commits that will move temporary file
cleanup to a separate auxiliary process.
---
 src/backend/postmaster/postmaster.c         |  2 +-
 src/backend/storage/file/fd.c               | 20 ++++++++++----------
 src/include/storage/fd.h                    |  4 ++--
 src/test/recovery/t/022_crash_temp_files.pl |  6 ++++--
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5162ee9dec..e67370012f 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1127,7 +1127,7 @@ PostmasterMain(int argc, char *argv[])
 	 * safe to do so now, because we verified earlier that there are no
 	 * conflicting Postgres processes in this data directory.
 	 */
-	RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
+	RemovePgTempDir(PG_TEMP_FILES_DIR, true, false);
 #endif
 
 	/*
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 24704b6a02..aa6ac8f219 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3160,7 +3160,7 @@ RemovePgTempFiles(void)
 	 * First process temp files in pg_default ($PGDATA/base)
 	 */
 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
-	RemovePgTempFilesInDir(temp_path, true, false);
+	RemovePgTempDir(temp_path, true, false);
 	RemovePgTempRelationFiles("base");
 
 	/*
@@ -3176,7 +3176,7 @@ RemovePgTempFiles(void)
 
 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
-		RemovePgTempFilesInDir(temp_path, true, false);
+		RemovePgTempDir(temp_path, true, false);
 
 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
@@ -3209,7 +3209,7 @@ RemovePgTempFiles(void)
  * them separate.)
  */
 void
-RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
+RemovePgTempDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 {
 	DIR		   *temp_dir;
 	struct dirent *temp_de;
@@ -3247,13 +3247,7 @@ RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 			if (S_ISDIR(statbuf.st_mode))
 			{
 				/* recursively remove contents, then directory itself */
-				RemovePgTempFilesInDir(rm_path, false, true);
-
-				if (rmdir(rm_path) < 0)
-					ereport(LOG,
-							(errcode_for_file_access(),
-							 errmsg("could not remove directory \"%s\": %m",
-									rm_path)));
+				RemovePgTempDir(rm_path, false, true);
 			}
 			else
 			{
@@ -3271,6 +3265,12 @@ RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 	}
 
 	FreeDir(temp_dir);
+
+	if (rmdir(tmpdirname) < 0)
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not remove directory \"%s\": %m",
+						tmpdirname)));
 }
 
 /* Process one tablespace directory, look for per-DB subdirectories */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 69549b000f..67a6ef4dbf 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -169,8 +169,8 @@ extern void AtEOXact_Files(bool isCommit);
 extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
 							  SubTransactionId parentSubid);
 extern void RemovePgTempFiles(void);
-extern void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
-								   bool unlink_all);
+extern void RemovePgTempDir(const char *tmpdirname, bool missing_ok,
+							bool unlink_all);
 extern bool looks_like_temp_rel_name(const char *name);
 
 extern int	pg_fsync(int fd);
diff --git a/src/test/recovery/t/022_crash_temp_files.pl b/src/test/recovery/t/022_crash_temp_files.pl
index 53a55c7a8a..8ed8afeadd 100644
--- a/src/test/recovery/t/022_crash_temp_files.pl
+++ b/src/test/recovery/t/022_crash_temp_files.pl
@@ -152,7 +152,8 @@ $node->poll_query_until('postgres', undef, '');
 
 # Check for temporary files
 is( $node->safe_psql(
-		'postgres', 'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
+		'postgres',
+		'SELECT COUNT(1) FROM pg_ls_dir($$base$$) WHERE pg_ls_dir = \'pgsql_tmp\''),
 	qq(0),
 	'no temporary files');
 
@@ -268,7 +269,8 @@ $node->restart();
 
 # Check the temporary files -- should be gone
 is( $node->safe_psql(
-		'postgres', 'SELECT COUNT(1) FROM pg_ls_dir($$base/pgsql_tmp$$)'),
+		'postgres',
+		'SELECT COUNT(1) FROM pg_ls_dir($$base$$) WHERE pg_ls_dir = \'pgsql_tmp\''),
 	qq(0),
 	'temporary file was removed');
 
-- 
2.25.1

>From 5e95666efa31d6c8aa351e430c37ead6e27acb72 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Sun, 5 Dec 2021 21:16:44 -0800
Subject: [PATCH v6 3/6] Split pgsql_tmp cleanup into two stages.

First, pgsql_tmp directories will be renamed to stage them for
removal.  Then, all files in pgsql_tmp are removed before removing
the staged directories themselves.  This change is being made in
preparation for a follow-up change to offload most temporary file
cleanup to the new custodian process.

Note that temporary relation files cannot be cleaned up via the
aforementioned strategy and will not be offloaded to the custodian.
---
 src/backend/postmaster/postmaster.c |   8 +-
 src/backend/storage/file/fd.c       | 174 ++++++++++++++++++++++++----
 src/include/storage/fd.h            |   2 +-
 3 files changed, 160 insertions(+), 24 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index e67370012f..82aa0c6307 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1402,7 +1402,8 @@ PostmasterMain(int argc, char *argv[])
 	 * Remove old temporary files.  At this point there can be no other
 	 * Postgres processes running in this directory, so this should be safe.
 	 */
-	RemovePgTempFiles();
+	RemovePgTempFiles(true, true);
+	RemovePgTempFiles(false, false);
 
 	/*
 	 * Initialize the autovacuum subsystem (again, no process start yet)
@@ -4053,7 +4054,10 @@ PostmasterStateMachine(void)
 
 		/* remove leftover temporary files after a crash */
 		if (remove_temp_files_after_crash)
-			RemovePgTempFiles();
+		{
+			RemovePgTempFiles(true, true);
+			RemovePgTempFiles(false, false);
+		}
 
 		/* allow background workers to immediately restart */
 		ResetBackgroundWorkerCrashTimes();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index aa6ac8f219..79ca3a5be9 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -112,6 +112,8 @@
 #define PG_FLUSH_DATA_WORKS 1
 #endif
 
+#define PG_TEMP_DIR_TO_REMOVE_PREFIX (PG_TEMP_FILES_DIR "_to_remove_")
+
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
@@ -338,6 +340,8 @@ static void BeforeShmemExit_Files(int code, Datum arg);
 static void CleanupTempFiles(bool isCommit, bool isProcExit);
 static void RemovePgTempRelationFiles(const char *tsdirname);
 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
+static void StagePgTempDirForRemoval(const char *tmp_dir);
+static void RemoveStagedPgTempDirs(const char *spc_dir);
 
 static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
@@ -3133,24 +3137,20 @@ CleanupTempFiles(bool isCommit, bool isProcExit)
  * Remove temporary and temporary relation files left over from a prior
  * postmaster session
  *
- * This should be called during postmaster startup.  It will forcibly
- * remove any leftover files created by OpenTemporaryFile and any leftover
- * temporary relation files created by mdcreate.
+ * If stage is true, this function will simply rename all pgsql_tmp directories
+ * to stage them for removal at a later time.  If stage is false, this function
+ * will delete all files in the staged directories as well as the directories
+ * themselves.
  *
- * During post-backend-crash restart cycle, this routine is called when
- * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
- * queries are using temp files could result in useless storage usage that can
- * only be reclaimed by a service restart. The argument against enabling it is
- * that someone might want to examine the temporary files for debugging
- * purposes. This does however mean that OpenTemporaryFile had better allow for
- * collision with an existing temp file name.
+ * If remove_relation_files is true, this function will remove the temporary
+ * relation files.  Otherwise, this step is skipped.
  *
  * NOTE: this function and its subroutines generally report syscall failures
  * with ereport(LOG) and keep going.  Removing temp files is not so critical
  * that we should fail to start the database when we can't do it.
  */
 void
-RemovePgTempFiles(void)
+RemovePgTempFiles(bool stage, bool remove_relation_files)
 {
 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
 	DIR		   *spc_dir;
@@ -3159,9 +3159,16 @@ RemovePgTempFiles(void)
 	/*
 	 * First process temp files in pg_default ($PGDATA/base)
 	 */
-	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
-	RemovePgTempDir(temp_path, true, false);
-	RemovePgTempRelationFiles("base");
+	if (stage)
+	{
+		snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
+		StagePgTempDirForRemoval(temp_path);
+	}
+	else
+		RemoveStagedPgTempDirs("base");
+
+	if (remove_relation_files)
+		RemovePgTempRelationFiles("base");
 
 	/*
 	 * Cycle through temp directories for all non-default tablespaces.
@@ -3174,13 +3181,26 @@ RemovePgTempFiles(void)
 			strcmp(spc_de->d_name, "..") == 0)
 			continue;
 
-		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
-				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
-		RemovePgTempDir(temp_path, true, false);
+		if (stage)
+		{
+			snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
+					 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
+					 PG_TEMP_FILES_DIR);
+			StagePgTempDirForRemoval(temp_path);
+		}
+		else
+		{
+			snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+					 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+			RemoveStagedPgTempDirs(temp_path);
+		}
 
-		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
-				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
-		RemovePgTempRelationFiles(temp_path);
+		if (remove_relation_files)
+		{
+			snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+					 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+			RemovePgTempRelationFiles(temp_path);
+		}
 	}
 
 	FreeDir(spc_dir);
@@ -3194,7 +3214,119 @@ RemovePgTempFiles(void)
 }
 
 /*
- * Process one pgsql_tmp directory for RemovePgTempFiles.
+ * StagePgTempDirForRemoval
+ *
+ * This function renames the given directory with a special prefix that
+ * RemoveStagedPgTempDirs() will know to look for.  An integer is appended to
+ * the end of the new directory name in case previously staged pgsql_tmp
+ * directories have not yet been removed.
+ */
+static void
+StagePgTempDirForRemoval(const char *tmp_dir)
+{
+	DIR		   *dir;
+	char		stage_path[MAXPGPATH * 2];
+	char		parent_path[MAXPGPATH * 2];
+	struct stat statbuf;
+
+	/*
+	 * If tmp_dir doesn't exist, there is nothing to stage.
+	 */
+	dir = AllocateDir(tmp_dir);
+	if (dir == NULL)
+	{
+		if (errno != ENOENT)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not open directory \"%s\": %m", tmp_dir)));
+		return;
+	}
+	FreeDir(dir);
+
+	strlcpy(parent_path, tmp_dir, MAXPGPATH * 2);
+	get_parent_directory(parent_path);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument is
+	 * just a file name (see comments in path.c), so handle that as being the
+	 * current directory.
+	 */
+	if (strlen(parent_path) == 0)
+		strlcpy(parent_path, ".", MAXPGPATH * 2);
+
+	/*
+	 * Find a name for the stage directory.  We just increment an integer at the
+	 * end of the name until we find one that doesn't exist.
+	 */
+	for (int n = 0; n <= INT_MAX; n++)
+	{
+		snprintf(stage_path, sizeof(stage_path), "%s/%s%d", parent_path,
+				 PG_TEMP_DIR_TO_REMOVE_PREFIX, n);
+
+		if (stat(stage_path, &statbuf) != 0)
+		{
+			if (errno == ENOENT)
+				break;
+
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", stage_path)));
+			return;
+		}
+
+		stage_path[0] = '\0';
+	}
+
+	/*
+	 * In the unlikely event that we couldn't find a name for the stage
+	 * directory, bail out.
+	 */
+	if (stage_path[0] == '\0')
+	{
+		ereport(LOG,
+				(errmsg("could not stage \"%s\" for deletion",
+						tmp_dir)));
+		return;
+	}
+
+	/*
+	 * Rename the temporary directory.
+	 */
+	if (rename(tmp_dir, stage_path) != 0)
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not rename directory \"%s\" to \"%s\": %m",
+						tmp_dir, stage_path)));
+}
+
+/*
+ * RemoveStagedPgTempDirs
+ *
+ * This function removes all pgsql_tmp directories that have been staged for
+ * removal by StagePgTempDirForRemoval() in the given tablespace directory.
+ */
+static void
+RemoveStagedPgTempDirs(const char *spc_dir)
+{
+	char		temp_path[MAXPGPATH * 2];
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(spc_dir);
+	while ((de = ReadDirExtended(dir, spc_dir, LOG)) != NULL)
+	{
+		if (strncmp(de->d_name, PG_TEMP_DIR_TO_REMOVE_PREFIX,
+					strlen(PG_TEMP_DIR_TO_REMOVE_PREFIX)) != 0)
+			continue;
+
+		snprintf(temp_path, sizeof(temp_path), "%s/%s", spc_dir, de->d_name);
+		RemovePgTempDir(temp_path, true, false);
+	}
+	FreeDir(dir);
+}
+
+/*
+ * Process one pgsql_tmp directory for RemoveStagedPgTempDirs.
  *
  * If missing_ok is true, it's all right for the named directory to not exist.
  * Any other problem results in a LOG message.  (missing_ok should be true at
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 67a6ef4dbf..3b0d6f62d6 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -168,7 +168,7 @@ extern Oid	GetNextTempTableSpace(void);
 extern void AtEOXact_Files(bool isCommit);
 extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
 							  SubTransactionId parentSubid);
-extern void RemovePgTempFiles(void);
+extern void RemovePgTempFiles(bool stage, bool remove_relation_files);
 extern void RemovePgTempDir(const char *tmpdirname, bool missing_ok,
 							bool unlink_all);
 extern bool looks_like_temp_rel_name(const char *name);
-- 
2.25.1

>From 43042799b96b588a446c509637b5acf570e2a325 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Sun, 5 Dec 2021 21:42:52 -0800
Subject: [PATCH v6 4/6] Move pgsql_tmp file removal to custodian process.

With this change, startup (and restart after a crash) simply
renames the pgsql_tmp directories, and the custodian process
actually removes all the files in the staged directories as well as
the staged directories themselves.  This should help avoid long
startup delays due to many leftover temporary files.
---
 src/backend/postmaster/custodian.c  | 14 +++++++++++++-
 src/backend/postmaster/postmaster.c | 14 +++++++++-----
 src/backend/storage/file/fd.c       | 21 +++++++++++++++------
 src/include/postmaster/custodian.h  |  3 +++
 4 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/custodian.c b/src/backend/postmaster/custodian.c
index db00282658..a0ec94ea5c 100644
--- a/src/backend/postmaster/custodian.c
+++ b/src/backend/postmaster/custodian.c
@@ -196,7 +196,19 @@ CustodianMain(void)
 		CustodianShmem->cust_flags = 0;
 		SpinLockRelease(&CustodianShmem->cust_lck);
 
-		/* TODO: offloaded tasks go here */
+		/*
+		 * Remove any pgsql_tmp directories that have been staged for deletion.
+		 * Since pgsql_tmp directories can accumulate many files, removing all
+		 * of the files during startup (which we used to do) can take a very
+		 * long time.  To avoid delaying startup, we simply have startup rename
+		 * the temporary directories, and we clean them up here.
+		 *
+		 * pgsql_tmp directories are not staged or cleaned in single-user mode,
+		 * so we don't need any extra handling outside of the custodian process
+		 * for this.
+		 */
+		if (flags & CUSTODIAN_REMOVE_TEMP_FILES)
+			RemovePgTempFiles(false, false);
 
 		/* Calculate how long to sleep */
 		end_time = (pg_time_t) time(NULL);
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 82aa0c6307..b67f8828df 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1401,9 +1401,11 @@ PostmasterMain(int argc, char *argv[])
 	/*
 	 * Remove old temporary files.  At this point there can be no other
 	 * Postgres processes running in this directory, so this should be safe.
+	 *
+	 * Note that this just stages the pgsql_tmp directories for deletion.  The
+	 * custodian process is responsible for actually removing the files.
 	 */
 	RemovePgTempFiles(true, true);
-	RemovePgTempFiles(false, false);
 
 	/*
 	 * Initialize the autovacuum subsystem (again, no process start yet)
@@ -4052,12 +4054,14 @@ PostmasterStateMachine(void)
 		ereport(LOG,
 				(errmsg("all server processes terminated; reinitializing")));
 
-		/* remove leftover temporary files after a crash */
+		/*
+		 * Remove leftover temporary files after a crash.
+		 *
+		 * Note that this just stages the pgsql_tmp directories for deletion.
+		 * The custodian process is responsible for actually removing the files.
+		 */
 		if (remove_temp_files_after_crash)
-		{
 			RemovePgTempFiles(true, true);
-			RemovePgTempFiles(false, false);
-		}
 
 		/* allow background workers to immediately restart */
 		ResetBackgroundWorkerCrashTimes();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 79ca3a5be9..46dc1925a2 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -97,6 +97,7 @@
 #include "pgstat.h"
 #include "port/pg_iovec.h"
 #include "portability/mem.h"
+#include "postmaster/custodian.h"
 #include "postmaster/startup.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -1640,9 +1641,9 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
  *
  * Directories created within the top-level temporary directory should begin
  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
- * deleted at startup by RemovePgTempFiles().  Further subdirectories below
- * that do not need any particular prefix.
-*/
+ * deleted by RemovePgTempFiles().  Further subdirectories below that do not
+ * need any particular prefix.
+ */
 void
 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
 {
@@ -1840,9 +1841,9 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
  *
  * If the file is inside the top-level temporary directory, its name should
  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
- * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
- * inside a directory created with PathNameCreateTemporaryDir(), in which case
- * the prefix isn't needed.
+ * and deleted by RemovePgTempFiles().  Alternatively, it can be inside a
+ * directory created with PathNameCreateTemporaryDir(), in which case the prefix
+ * isn't needed.
  */
 File
 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
@@ -3211,6 +3212,14 @@ RemovePgTempFiles(bool stage, bool remove_relation_files)
 	 * would create a race condition.  It's done separately, earlier in
 	 * postmaster startup.
 	 */
+
+	/*
+	 * If we just staged some pgsql_tmp directories for removal, wake up the
+	 * custodian process so that it deletes all the files in the staged
+	 * directories as well as the directories themselves.
+	 */
+	if (stage)
+		RequestCustodian(CUSTODIAN_REMOVE_TEMP_FILES);
 }
 
 /*
diff --git a/src/include/postmaster/custodian.h b/src/include/postmaster/custodian.h
index c95a7c7de6..f6dcd9ddef 100644
--- a/src/include/postmaster/custodian.h
+++ b/src/include/postmaster/custodian.h
@@ -17,4 +17,7 @@ extern Size CustodianShmemSize(void);
 extern void CustodianShmemInit(void);
 extern void RequestCustodian(int flags);
 
+/* flags for RequestCustodian() */
+#define CUSTODIAN_REMOVE_TEMP_FILES		0x0001
+
 #endif						/* _CUSTODIAN_H */
-- 
2.25.1

>From a58a6bb70785a557a150680b64cd8ce78ce1b73a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Sun, 5 Dec 2021 22:02:40 -0800
Subject: [PATCH v6 5/6] Move removal of old serialized snapshots to custodian.

This was only done during checkpoints because it was a convenient
place to put it.  However, if there are many snapshots to remove,
it can significantly extend checkpoint time.  To avoid this, move
this work to the newly-introduced custodian process.
---
 src/backend/access/transam/xlog.c           |  6 ++++--
 src/backend/postmaster/custodian.c          | 12 ++++++++++++
 src/backend/replication/logical/snapbuild.c |  9 ++++-----
 src/include/postmaster/custodian.h          |  3 ++-
 src/include/replication/snapbuild.h         |  2 +-
 5 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8764084e21..621bda0844 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -75,13 +75,13 @@
 #include "port/atomics.h"
 #include "port/pg_iovec.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/custodian.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
 #include "replication/basebackup.h"
 #include "replication/logical.h"
 #include "replication/origin.h"
 #include "replication/slot.h"
-#include "replication/snapbuild.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -6840,10 +6840,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
-	CheckPointSnapBuild();
 	CheckPointLogicalRewriteHeap();
 	CheckPointReplicationOrigin();
 
+	/* tasks offloaded to custodian */
+	RequestCustodian(CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS);
+
 	/* Write out all dirty data in SLRUs and the main buffer pool */
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
diff --git a/src/backend/postmaster/custodian.c b/src/backend/postmaster/custodian.c
index a0ec94ea5c..861de882c6 100644
--- a/src/backend/postmaster/custodian.c
+++ b/src/backend/postmaster/custodian.c
@@ -31,6 +31,7 @@
 #include "pgstat.h"
 #include "postmaster/custodian.h"
 #include "postmaster/interrupt.h"
+#include "replication/snapbuild.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
@@ -210,6 +211,17 @@ CustodianMain(void)
 		if (flags & CUSTODIAN_REMOVE_TEMP_FILES)
 			RemovePgTempFiles(false, false);
 
+		/*
+		 * Remove serialized snapshots that are no longer required by any
+		 * logical replication slot.
+		 *
+		 * It is not important for these to be removed in single-user mode, so
+		 * we don't need any extra handling outside of the custodian process for
+		 * this.
+		 */
+		if (flags & CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS)
+			RemoveOldSerializedSnapshots();
+
 		/* Calculate how long to sleep */
 		end_time = (pg_time_t) time(NULL);
 		elapsed_secs = end_time - start_time;
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 1119a12db9..42eb064bd8 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1911,14 +1911,13 @@ snapshot_not_interesting:
 
 /*
  * Remove all serialized snapshots that are not required anymore because no
- * slot can need them. This doesn't actually have to run during a checkpoint,
- * but it's a convenient point to schedule this.
+ * slot can need them.
  *
- * NB: We run this during checkpoints even if logical decoding is disabled so
- * we cleanup old slots at some point after it got disabled.
+ * NB: We run this even if logical decoding is disabled so we cleanup old slots
+ * at some point after it got disabled.
  */
 void
-CheckPointSnapBuild(void)
+RemoveOldSerializedSnapshots(void)
 {
 	XLogRecPtr	cutoff;
 	XLogRecPtr	redo;
diff --git a/src/include/postmaster/custodian.h b/src/include/postmaster/custodian.h
index f6dcd9ddef..769c07f2c9 100644
--- a/src/include/postmaster/custodian.h
+++ b/src/include/postmaster/custodian.h
@@ -18,6 +18,7 @@ extern void CustodianShmemInit(void);
 extern void RequestCustodian(int flags);
 
 /* flags for RequestCustodian() */
-#define CUSTODIAN_REMOVE_TEMP_FILES		0x0001
+#define CUSTODIAN_REMOVE_TEMP_FILES				0x0001
+#define CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS	0x0002
 
 #endif						/* _CUSTODIAN_H */
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
index d179251aad..55a2beb434 100644
--- a/src/include/replication/snapbuild.h
+++ b/src/include/replication/snapbuild.h
@@ -57,7 +57,7 @@ struct ReorderBuffer;
 struct xl_heap_new_cid;
 struct xl_running_xacts;
 
-extern void CheckPointSnapBuild(void);
+extern void RemoveOldSerializedSnapshots(void);
 
 extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache,
 										  TransactionId xmin_horizon, XLogRecPtr start_lsn,
-- 
2.25.1

>From 0add8bb19a4ee83c6a6ec1f313329d737bf304a5 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossa...@amazon.com>
Date: Sun, 12 Dec 2021 22:07:11 -0800
Subject: [PATCH v6 6/6] Move removal of old logical rewrite mapping files to
 custodian.

If there are many such files to remove, checkpoints can take much
longer.  To avoid this, move this work to the newly-introduced
custodian process.
---
 src/backend/access/heap/rewriteheap.c | 79 +++++++++++++++++++++++----
 src/backend/postmaster/custodian.c    | 44 +++++++++++++++
 src/include/access/rewriteheap.h      |  1 +
 src/include/postmaster/custodian.h    |  5 ++
 4 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..edeab65e60 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -116,6 +116,7 @@
 #include "lib/ilist.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "postmaster/custodian.h"
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "storage/bufmgr.h"
@@ -1182,7 +1183,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
  * Perform a checkpoint for logical rewrite mappings
  *
  * This serves two tasks:
- * 1) Remove all mappings not needed anymore based on the logical restart LSN
+ * 1) Alert the custodian to remove all mappings not needed anymore based on the
+ *    logical restart LSN
  * 2) Flush all remaining mappings to disk, so that replay after a checkpoint
  *	  only has to deal with the parts of a mapping that have been written out
  *	  after the checkpoint started.
@@ -1210,6 +1212,10 @@ CheckPointLogicalRewriteHeap(void)
 	if (cutoff != InvalidXLogRecPtr && redo < cutoff)
 		cutoff = redo;
 
+	/* let the custodian know what it can remove */
+	CustodianSetLogicalRewriteCutoff(cutoff);
+	RequestCustodian(CUSTODIAN_REMOVE_REWRITE_MAPPINGS);
+
 	mappings_dir = AllocateDir("pg_logical/mappings");
 	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
 	{
@@ -1240,15 +1246,7 @@ CheckPointLogicalRewriteHeap(void)
 
 		lsn = ((uint64) hi) << 32 | lo;
 
-		if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
-		{
-			elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
-			if (unlink(path) < 0)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
-		}
-		else
+		if (lsn >= cutoff && cutoff != InvalidXLogRecPtr)
 		{
 			/* on some operating systems fsyncing a file requires O_RDWR */
 			int			fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
@@ -1286,3 +1284,64 @@ CheckPointLogicalRewriteHeap(void)
 	/* persist directory entries to disk */
 	fsync_fname("pg_logical/mappings", true);
 }
+
+/*
+ * Remove all mappings not needed anymore based on the logical restart LSN saved
+ * by the checkpointer.  We use this saved value instead of calling
+ * ReplicationSlotsComputeLogicalRestartLSN() so that we don't interfere with an
+ * ongoing call to CheckPointLogicalRewriteHeap() that is flushing mappings to
+ * disk.
+ */
+void
+RemoveOldLogicalRewriteMappings(void)
+{
+	XLogRecPtr	cutoff;
+	DIR		   *mappings_dir;
+	struct dirent *mapping_de;
+	char		path[MAXPGPATH + 20];
+	bool		value_set = false;
+
+	cutoff = CustodianGetLogicalRewriteCutoff(&value_set);
+	if (!value_set)
+		return;
+
+	mappings_dir = AllocateDir("pg_logical/mappings");
+	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
+	{
+		struct stat statbuf;
+		Oid			dboid;
+		Oid			relid;
+		XLogRecPtr	lsn;
+		TransactionId rewrite_xid;
+		TransactionId create_xid;
+		uint32		hi,
+					lo;
+
+		if (strcmp(mapping_de->d_name, ".") == 0 ||
+			strcmp(mapping_de->d_name, "..") == 0)
+			continue;
+
+		snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
+		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
+			continue;
+
+		/* Skip over files that cannot be ours. */
+		if (strncmp(mapping_de->d_name, "map-", 4) != 0)
+			continue;
+
+		if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
+				   &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
+			elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
+
+		lsn = ((uint64) hi) << 32 | lo;
+		if (lsn >= cutoff && cutoff != InvalidXLogRecPtr)
+			continue;
+
+		elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
+		if (unlink(path) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not remove file \"%s\": %m", path)));
+	}
+	FreeDir(mappings_dir);
+}
diff --git a/src/backend/postmaster/custodian.c b/src/backend/postmaster/custodian.c
index 861de882c6..0ce4edcf61 100644
--- a/src/backend/postmaster/custodian.c
+++ b/src/backend/postmaster/custodian.c
@@ -27,6 +27,7 @@
 
 #include <time.h>
 
+#include "access/rewriteheap.h"
 #include "libpq/pqsignal.h"
 #include "pgstat.h"
 #include "postmaster/custodian.h"
@@ -46,6 +47,9 @@ typedef struct
 {
 	slock_t		cust_lck;
 	int			cust_flags;
+
+	XLogRecPtr  logical_rewrite_mappings_cutoff;    /* can remove older mappings */
+	bool        logical_rewrite_mappings_cutoff_set;
 } CustodianShmemStruct;
 
 static CustodianShmemStruct *CustodianShmem;
@@ -222,6 +226,16 @@ CustodianMain(void)
 		if (flags & CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS)
 			RemoveOldSerializedSnapshots();
 
+		/*
+		 * Remove logical rewrite mapping files that are no longer needed.
+		 *
+		 * It is not important for these to be removed in single-user mode, so
+		 * we don't need any extra handling outside of the custodian process for
+		 * this.
+		 */
+		if (flags & CUSTODIAN_REMOVE_REWRITE_MAPPINGS)
+			RemoveOldLogicalRewriteMappings();
+
 		/* Calculate how long to sleep */
 		end_time = (pg_time_t) time(NULL);
 		elapsed_secs = end_time - start_time;
@@ -274,3 +288,33 @@ RequestCustodian(int flags)
 	if (ProcGlobal->custodianLatch)
 		SetLatch(ProcGlobal->custodianLatch);
 }
+
+/*
+ * Used by CheckPointLogicalRewriteHeap() to tell the custodian which logical
+ * rewrite mapping files it can remove.
+ */
+void
+CustodianSetLogicalRewriteCutoff(XLogRecPtr cutoff)
+{
+	SpinLockAcquire(&CustodianShmem->cust_lck);
+	CustodianShmem->logical_rewrite_mappings_cutoff = cutoff;
+	CustodianShmem->logical_rewrite_mappings_cutoff_set = true;
+	SpinLockRelease(&CustodianShmem->cust_lck);
+}
+
+/*
+ * Used by the custodian to determine which logical rewrite mapping files it can
+ * remove.
+ */
+XLogRecPtr
+CustodianGetLogicalRewriteCutoff(bool *value_set)
+{
+	XLogRecPtr  cutoff;
+
+	SpinLockAcquire(&CustodianShmem->cust_lck);
+	cutoff = CustodianShmem->logical_rewrite_mappings_cutoff;
+	*value_set = CustodianShmem->logical_rewrite_mappings_cutoff_set;
+	SpinLockRelease(&CustodianShmem->cust_lck);
+
+	return cutoff;
+}
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 3e27790b3f..61d7aa8ed8 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -53,5 +53,6 @@ typedef struct LogicalRewriteMappingData
  */
 #define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x"
 extern void CheckPointLogicalRewriteHeap(void);
+extern void RemoveOldLogicalRewriteMappings(void);
 
 #endif							/* REWRITE_HEAP_H */
diff --git a/src/include/postmaster/custodian.h b/src/include/postmaster/custodian.h
index 769c07f2c9..1af96ebd02 100644
--- a/src/include/postmaster/custodian.h
+++ b/src/include/postmaster/custodian.h
@@ -12,13 +12,18 @@
 #ifndef _CUSTODIAN_H
 #define _CUSTODIAN_H
 
+#include "access/xlogdefs.h"
+
 extern void CustodianMain(void) pg_attribute_noreturn();
 extern Size CustodianShmemSize(void);
 extern void CustodianShmemInit(void);
 extern void RequestCustodian(int flags);
+extern void CustodianSetLogicalRewriteCutoff(XLogRecPtr cutoff);
+extern XLogRecPtr CustodianGetLogicalRewriteCutoff(bool *value_set);
 
 /* flags for RequestCustodian() */
 #define CUSTODIAN_REMOVE_TEMP_FILES				0x0001
 #define CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS	0x0002
+#define CUSTODIAN_REMOVE_REWRITE_MAPPINGS		0x0004
 
 #endif						/* _CUSTODIAN_H */
-- 
2.25.1

Reply via email to