On Mon Feb 24, 2025 at 10:56 AM CET, Jelte Fennema-Nio wrote:
> Great! Attached are the updated other patches, I think I addressed all feedback.
Ugh, a compiler warning snuck in on Windows during some of my final refactoring. Fixed now.
From ec72e05e87c73dee3de73f9a6586e8e8db2d919e Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio <github-t...@jeltef.nl>
Date: Sun, 23 Feb 2025 16:52:29 +0100
Subject: [PATCH v5 1/3] Adds a helper for places that shell out to system()

We need to call system() in a few places, and to do so safely we need
some pre/post-fork logic. This encapsulates that logic into a single
System helper function.

The main reason to do this is that in a follow-up commit we want to add
even more pre- and post-fork logic.
---
 src/backend/access/transam/xlogarchive.c | 28 +-----------
 src/backend/archive/shell_archive.c      |  6 +--
 src/backend/postmaster/startup.c         | 54 +++++++++++++++++-------
 src/include/postmaster/startup.h         |  3 +-
 4 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 1ef1713c91a..59a0f191339 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -158,27 +158,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));
 
-	fflush(NULL);
-	pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND);
-
-	/*
-	 * PreRestoreCommand() informs the SIGTERM handler for the startup process
-	 * that it should proc_exit() right away. This is done for the duration
-	 * of the system() call because there isn't a good way to break out while
-	 * it is executing. Since we might call proc_exit() in a signal handler,
-	 * it is best to put any additional logic before or after the
-	 * PreRestoreCommand()/PostRestoreCommand() section.
-	 */
-	PreRestoreCommand();
-
-	/*
-	 * Copy xlog from archival storage to XLOGDIR
-	 */
-	rc = system(xlogRestoreCmd);
-
-	PostRestoreCommand();
-
-	pgstat_report_wait_end();
+	rc = System(xlogRestoreCmd, WAIT_EVENT_RESTORE_COMMAND);
 
 	pfree(xlogRestoreCmd);
 
 	if (rc == 0)
@@ -325,11 +305,7 @@ ExecuteRecoveryCommand(const char *command, const char *commandName,
 	/*
 	 * execute the constructed command
 	 */
-	fflush(NULL);
-	pgstat_report_wait_start(wait_event_info);
-	rc = system(xlogRecoveryCmd);
-	pgstat_report_wait_end();
-
+	rc = System(xlogRecoveryCmd, wait_event_info);
 	pfree(xlogRecoveryCmd);
 
 	if (rc != 0)
diff --git a/src/backend/archive/shell_archive.c b/src/backend/archive/shell_archive.c
index 828723afe47..7977e5a5092 100644
--- a/src/backend/archive/shell_archive.c
+++ b/src/backend/archive/shell_archive.c
@@ -22,6 +22,7 @@
 #include "archive/shell_archive.h"
 #include "common/percentrepl.h"
 #include "pgstat.h"
+#include "postmaster/startup.h"
 
 static bool shell_archive_configured(ArchiveModuleState *state);
 static bool shell_archive_file(ArchiveModuleState *state,
@@ -75,10 +76,7 @@ shell_archive_file(ArchiveModuleState *state, const char *file,
 			(errmsg_internal("executing archive command \"%s\"",
 							 xlogarchcmd)));
 
-	fflush(NULL);
-	pgstat_report_wait_start(WAIT_EVENT_ARCHIVE_COMMAND);
-	rc = system(xlogarchcmd);
-	pgstat_report_wait_end();
+	rc = System(xlogarchcmd, WAIT_EVENT_ARCHIVE_COMMAND);
 
 	if (rc != 0)
 	{
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index 88eab3d0baf..b0cd40c081e 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -33,6 +33,7 @@
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/timeout.h"
+#include "utils/wait_event_types.h"
 
 
 #ifndef USE_POSTMASTER_DEATH_SIGNAL
@@ -264,24 +265,45 @@ StartupProcessMain(const void *startup_data, size_t startup_data_len)
 	proc_exit(0);
 }
 
-void
-PreRestoreCommand(void)
+/*
+ * A custom wrapper around the system() function that calls the necessary
+ * functions pre/post-fork.
+ */
+int
+System(const char *command, uint32 wait_event_info)
 {
-	/*
-	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're at a safe point to do that.
-	 * Check if we had already received the signal, so that we don't miss a
-	 * shutdown request received just before this.
-	 */
-	in_restore_command = true;
-	if (shutdown_requested)
-		proc_exit(1);
-}
+	int			rc;
 
-void
-PostRestoreCommand(void)
-{
-	in_restore_command = false;
+	fflush(NULL);
+	pgstat_report_wait_start(wait_event_info);
+
+	if (wait_event_info == WAIT_EVENT_RESTORE_COMMAND)
+	{
+		/*
+		 * Set in_restore_command to tell the signal handler that we should
+		 * exit right away on SIGTERM. This is done for the duration of the
+		 * system() call because there isn't a good way to break out while it
+		 * is executing. Since we might call proc_exit() in a signal handler,
+		 * it is best to put any additional logic outside of this section
+		 * where in_restore_command is set to true.
+		 */
+		in_restore_command = true;
+
+		/*
+		 * Also check if we had already received the signal, so that we don't
+		 * miss a shutdown request received just before this.
+		 */
+		if (shutdown_requested)
+			proc_exit(1);
+	}
+
+	rc = system(command);
+
+	if (wait_event_info == WAIT_EVENT_RESTORE_COMMAND)
+		in_restore_command = false;
+
+	pgstat_report_wait_end();
+	return rc;
 }
 
 bool
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index ae0f6347fc0..09f5dcd3d03 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -27,8 +27,7 @@ extern PGDLLIMPORT int log_startup_progress_interval;
 
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(const void *startup_data, size_t startup_data_len) pg_attribute_noreturn();
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
+extern int	System(const char *command, uint32 wait_event_info);
 extern bool IsPromoteSignaled(void);
 extern void ResetPromoteSignaled(void);
 

base-commit: 454c182f8542890d0e2eac85f70d9a254a34fce3
-- 
2.43.0
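To make the intent of patch 1 easier to see at a glance: every call site now
collapses into a single call. This is only an illustrative sketch of a
hypothetical caller (not taken from the patch), assuming the System()
signature added above:

    #include "pgstat.h"
    #include "postmaster/startup.h"

    static int
    run_archive_command(const char *cmd)
    {
        /*
         * System() does the fflush(NULL), the wait-event reporting, and (for
         * restore commands) the in_restore_command bookkeeping for us.
         */
        return System(cmd, WAIT_EVENT_ARCHIVE_COMMAND);
    }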
From d04d0350510efc57dd1cd5947ef90f139972d3fc Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio <github-t...@jeltef.nl>
Date: Tue, 11 Feb 2025 19:15:36 +0100
Subject: [PATCH v5 2/3] Bump postmaster soft open file limit (RLIMIT_NOFILE) when necessary

The default open file limit of 1024 on Linux is extremely low. The
reason that this hasn't changed is that doing so would break legacy
programs that use the select(2) system call in hard-to-debug ways. So
instead, programs that want to opt in to a higher open file limit are
expected to bump their soft limit to their hard limit on startup.
Details on this are very well explained in a blog post by the systemd
author[1]. There's also a similar change done by the Go language[2].

This starts bumping the postmaster's soft open file limit when we
realize that we'll run into the soft limit with the requested
max_files_per_process GUC. We do so by slightly changing the meaning of
the max_files_per_process GUC. The actual (not publicly exposed) limit
is max_safe_fds; previously this would be set to:

    max_files_per_process - already_open_files - NUM_RESERVED_FDS

After this change we instead try to set max_safe_fds to
max_files_per_process if the system allows that. This is deemed more
natural for users to understand, because now the limit on the files
that they can open is actually what they configured in
max_files_per_process.

Adding this infrastructure to change RLIMIT_NOFILE when needed is
especially useful for the AIO work that Andres is doing, because
io_uring consumes a lot of file descriptors. Even without looking at
AIO there is a large number of reports from people who need to change
their soft file limit before starting Postgres, sometimes falling back
to lowering max_files_per_process when they fail to do so[3-8]. It's
also not all that strange to fail at setting the soft open file limit,
because there are multiple places where one can configure such limits
and usually only one of them is effective (which one depends on how
Postgres is started). In cloud environments it's also often not
possible for the user to change the soft limit, because they don't
control the way that Postgres is started.

One thing to note is that we temporarily restore the original soft
limit when shelling out to other executables. This is done as a
precaution in case those executables are using select(2).

[1]: https://0pointer.net/blog/file-descriptor-limits.html
[2]: https://github.com/golang/go/issues/46279
[3]: https://serverfault.com/questions/785330/getting-too-many-open-files-error-for-postgres
[4]: https://serverfault.com/questions/716982/how-to-raise-max-no-of-file-descriptors-for-daemons-running-on-debian-jessie
[5]: https://www.postgresql.org/message-id/flat/CAKtc8vXh7NvP_qWj8EqqorPY97bvxSaX3h5u7a9PptRFHW5x7g%40mail.gmail.com
[6]: https://www.postgresql.org/message-id/flat/113ce31b0908120955w77029099i7ececc053084095a%40mail.gmail.com
[7]: https://github.com/abiosoft/colima/discussions/836
[8]: https://www.postgresql.org/message-id/flat/29663.1007738957%40sss.pgh.pa.us#2079ec9e2d8b251593812a3711bfe9e9
---
 doc/src/sgml/config.sgml         |  17 ++-
 src/backend/postmaster/startup.c |   3 +
 src/backend/storage/file/fd.c    | 208 +++++++++++++++++++++++++++----
 src/include/storage/fd.h         |   2 +
 4 files changed, 199 insertions(+), 31 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 007746a4429..c69522be67a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2378,15 +2378,14 @@ include_dir 'conf.d'
       </term>
       <listitem>
        <para>
-        Sets the maximum number of simultaneously open files allowed to each
-        server subprocess. The default is one thousand files. If the kernel is enforcing
-        a safe per-process limit, you don't need to worry about this setting.
-        But on some platforms (notably, most BSD systems), the kernel will
-        allow individual processes to open many more files than the system
-        can actually support if many processes all try to open
-        that many files. If you find yourself seeing <quote>Too many open
-        files</quote> failures, try reducing this setting.
-        This parameter can only be set at server start.
+        Sets the maximum number of files each server subprocess is allowed
+        to open. The default is one thousand files. If the kernel is enforcing
+        a lower soft limit, Postgres will automatically increase it if the hard
+        limit allows that. Setting this value too high can cause resource
+        exhaustion problems: on some platforms (notably, most BSD systems), the
+        kernel will allow individual processes to open many more files than the
+        system can actually support if many processes all try to open that many
+        files. This parameter can only be set at server start.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index b0cd40c081e..d3a5e2590ab 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -30,6 +30,7 @@
 #include "storage/pmsignal.h"
 #include "storage/procsignal.h"
 #include "storage/standby.h"
+#include "storage/fd.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/timeout.h"
@@ -274,6 +275,7 @@ System(const char *command, uint32 wait_event_info)
 {
 	int			rc;
 
+	RestoreOriginalOpenFileLimit();
 	fflush(NULL);
 	pgstat_report_wait_start(wait_event_info);
 
@@ -303,6 +305,7 @@ System(const char *command, uint32 wait_event_info)
 		in_restore_command = false;
 
 	pgstat_report_wait_end();
+	RestoreCustomOpenFileLimit();
 	return rc;
 }
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index e454db4c020..f665422692d 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -157,6 +157,13 @@ int			max_files_per_process = 1000;
  */
 int			max_safe_fds = FD_MINFREE;	/* default if not changed */
 
+#ifdef HAVE_GETRLIMIT
+static bool saved_original_max_open_files;
+static struct rlimit original_max_open_files;
+static struct rlimit custom_max_open_files;
+#endif
+
+
 /* Whether it is safe to continue running after fsync() fails. */
 bool		data_sync_retry = false;
 
@@ -945,6 +952,154 @@ InitTemporaryFileAccess(void)
 #endif
 }
 
+/*
+ * Returns true if the passed in highestfd is the last one that we're allowed
+ * to open based on our current open file limit.
+ */
+static bool
+IsOpenFileLimit(int highestfd)
+{
+#ifdef HAVE_GETRLIMIT
+	if (!saved_original_max_open_files)
+	{
+		return false;
+	}
+
+	return highestfd >= custom_max_open_files.rlim_cur - 1;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Increases the open file limit (RLIMIT_NOFILE) by the requested amount.
+ * Returns true if successful, false otherwise.
+ */
+static bool
+IncreaseOpenFileLimit(int extra_files)
+{
+#ifdef HAVE_GETRLIMIT
+	struct rlimit rlim;
+
+	if (!saved_original_max_open_files)
+	{
+		return false;
+	}
+
+	rlim = custom_max_open_files;
+
+	/* If we're already at the max we reached our limit */
+	if (rlim.rlim_cur == original_max_open_files.rlim_max)
+		return false;
+
+	/* Otherwise try to increase the soft limit to what we need */
+	rlim.rlim_cur = Min(rlim.rlim_cur + extra_files, rlim.rlim_max);
+
+	if (setrlimit(RLIMIT_NOFILE, &rlim) != 0)
+	{
+		/* We made sure not to exceed the hard limit, so this shouldn't fail */
+		ereport(WARNING, (errmsg("setrlimit failed: %m")));
+		return false;
+	}
+
+	custom_max_open_files = rlim;
+
+	elog(LOG, "increased open file limit to %ld", (long) rlim.rlim_cur);
+
+	return true;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Saves the original open file limit (RLIMIT_NOFILE) the first time this is
+ * called. If called again it's a no-op.
+ */
+static void
+SaveOriginalOpenFileLimit(void)
+{
+#ifdef HAVE_GETRLIMIT
+	int			status;
+
+	if (saved_original_max_open_files)
+	{
+		/* Already saved, no need to do it again */
+		return;
+	}
+
+	status = getrlimit(RLIMIT_NOFILE, &original_max_open_files);
+	if (status != 0)
+	{
+		ereport(WARNING, (errmsg("getrlimit failed: %m")));
+		return;
+	}
+
+	custom_max_open_files = original_max_open_files;
+	saved_original_max_open_files = true;
+	return;
+#else
+	return;
+#endif
+}
+
+/*
+ * RestoreOriginalOpenFileLimit --- Restore the original open file limit that
+ * was present at postmaster start.
+ *
+ * This should be called before spawning subprocesses that might use select(2)
+ * which can only handle file descriptors up to 1024.
+ */
+void
+RestoreOriginalOpenFileLimit(void)
+{
+#ifdef HAVE_GETRLIMIT
+	if (!saved_original_max_open_files)
+	{
+		return;
+	}
+
+	if (custom_max_open_files.rlim_cur == original_max_open_files.rlim_cur)
+	{
+		/* Not changed, so no need to call setrlimit at all */
+		return;
+	}
+
+	if (setrlimit(RLIMIT_NOFILE, &original_max_open_files) != 0)
+	{
+		ereport(WARNING, (errmsg("setrlimit failed: %m")));
+	}
+#endif
+}
+
+/*
+ * RestoreCustomOpenFileLimit --- Restores our custom open file limit after a
+ * previous call to RestoreOriginalOpenFileLimit.
+ */
+void
+RestoreCustomOpenFileLimit(void)
+{
+#ifdef HAVE_GETRLIMIT
+	if (!saved_original_max_open_files)
+	{
+		return;
+	}
+
+	if (custom_max_open_files.rlim_cur == original_max_open_files.rlim_cur)
+	{
+		/* Not changed, so no need to call setrlimit at all */
+		return;
+	}
+
+	if (setrlimit(RLIMIT_NOFILE, &custom_max_open_files) != 0)
+	{
+		ereport(WARNING, (errmsg("setrlimit failed: %m")));
+	}
+#endif
+}
+
 /*
  * count_usable_fds --- count how many FDs the system will let us open,
  *		and estimate how many are already open.
@@ -968,38 +1123,39 @@ count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
 	int			highestfd = 0;
 	int			j;
 
-#ifdef HAVE_GETRLIMIT
-	struct rlimit rlim;
-	int			getrlimit_status;
-#endif
-
 	size = 1024;
 	fd = (int *) palloc(size * sizeof(int));
 
-#ifdef HAVE_GETRLIMIT
-	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
-	if (getrlimit_status != 0)
-		ereport(WARNING, (errmsg("getrlimit failed: %m")));
-#endif							/* HAVE_GETRLIMIT */
+	SaveOriginalOpenFileLimit();
 
 	/* dup until failure or probe limit reached */
 	for (;;)
 	{
 		int			thisfd;
 
-#ifdef HAVE_GETRLIMIT
-
 		/*
 		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
 		 * some platforms
 		 */
-		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
-			break;
-#endif
+		if (IsOpenFileLimit(highestfd))
+		{
+			if (!IncreaseOpenFileLimit(max_to_probe - used))
+				break;
+		}
 
 		thisfd = dup(2);
 		if (thisfd < 0)
 		{
+			/*
+			 * Even though we do the pre-check above, it's still possible that
+			 * the call to dup fails with EMFILE. This can happen if the last
+			 * file descriptor was already assigned to an "already open" file.
+			 * One example of this happening is if we're already at the soft
+			 * limit when we call count_usable_fds.
+			 */
+			if (errno == EMFILE && IncreaseOpenFileLimit(max_to_probe - used))
+				continue;
+
 			/* Expect EMFILE or ENFILE, else it's fishy */
 			if (errno != EMFILE && errno != ENFILE)
 				elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
@@ -1053,15 +1209,10 @@ set_max_safe_fds(void)
 	 * or the experimentally-determined EMFILE limit.
 	 *----------
 	 */
-	count_usable_fds(max_files_per_process,
+	count_usable_fds(max_files_per_process + NUM_RESERVED_FDS,
 					 &usable_fds, &already_open);
 
-	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
-
-	/*
-	 * Take off the FDs reserved for system() etc.
-	 */
-	max_safe_fds -= NUM_RESERVED_FDS;
+	max_safe_fds = Min(usable_fds - NUM_RESERVED_FDS, max_files_per_process);
 
 	/*
 	 * Make sure we still have enough to get by.
@@ -2724,6 +2875,19 @@ OpenPipeStream(const char *command, const char *mode)
 	ReleaseLruFiles();
 
 TryAgain:
+
+	/*
+	 * It would be great if we could call RestoreOriginalOpenFileLimit here,
+	 * but since popen() also opens a file in the current process (this side
+	 * of the pipe) we cannot do so safely, because we might already have many
+	 * more files open than the original limit.
+	 *
+	 * The only way to address this would be implementing a custom popen()
+	 * that calls RestoreOriginalOpenFileLimit only around the actual fork
+	 * call, but that seems too much effort to handle the corner case where
+	 * this external command uses both select() and tries to open more files
+	 * than select() allows for.
+	 */
 	fflush(NULL);
 	pqsignal(SIGPIPE, SIG_DFL);
 	errno = 0;
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index e3067ab6597..d24e7f1c8df 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -177,6 +177,8 @@ extern void RemovePgTempFiles(void);
 extern void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all);
 extern bool looks_like_temp_rel_name(const char *name);
+extern void RestoreOriginalOpenFileLimit(void);
+extern void RestoreCustomOpenFileLimit(void);
 
 extern int	pg_fsync(int fd);
 extern int	pg_fsync_no_writethrough(int fd);
-- 
2.43.0
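For anyone who wants to experiment with the underlying mechanism outside of
Postgres, below is a small standalone sketch (not part of the patch, plain
POSIX, minimal error handling) of the approach described in the commit
message and in [1]: raise the RLIMIT_NOFILE soft limit toward the hard limit,
and temporarily restore the original soft limit while shelling out to a child
that might still use select(2).

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/resource.h>

    int
    main(void)
    {
        struct rlimit original;
        struct rlimit raised;

        if (getrlimit(RLIMIT_NOFILE, &original) != 0)
            return 1;

        /* Bump the soft limit as far as the hard limit allows. */
        raised = original;
        raised.rlim_cur = raised.rlim_max;
        if (setrlimit(RLIMIT_NOFILE, &raised) != 0)
            perror("setrlimit(raise)");
        else
            printf("soft limit raised to %ld (hard limit %ld)\n",
                   (long) raised.rlim_cur, (long) raised.rlim_max);

        /* Restore the original soft limit around a select(2)-using child. */
        setrlimit(RLIMIT_NOFILE, &original);
        if (system("ulimit -n") == -1)  /* child prints its own soft limit */
            perror("system");
        setrlimit(RLIMIT_NOFILE, &raised);

        return 0;
    }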
From ff352f09fa72d6e359be6bf20a215c3724be2303 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio <github-t...@jeltef.nl>
Date: Wed, 12 Feb 2025 01:08:07 +0100
Subject: [PATCH v5 3/3] Reflect the value of max_safe_fds in max_files_per_process

It is currently hard to figure out if max_safe_fds is significantly
lower than max_files_per_process. This starts reflecting the value of
max_safe_fds in max_files_per_process after our limit detection. We
still want to have two separate variables because for the bootstrap or
standalone-backend cases their values differ on purpose.
---
 src/backend/storage/file/fd.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index f665422692d..03a16e8e5b6 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1200,6 +1200,7 @@ set_max_safe_fds(void)
 {
 	int			usable_fds;
 	int			already_open;
+	char	   *max_safe_fds_string;
 
 	/*----------
 	 * We want to set max_safe_fds to
@@ -1214,6 +1215,12 @@
 	max_safe_fds = Min(usable_fds - NUM_RESERVED_FDS, max_files_per_process);
 
+	/* Update GUC variable to allow users to see the result */
+	max_safe_fds_string = psprintf("%d", max_safe_fds);
+	SetConfigOption("max_files_per_process", max_safe_fds_string,
+					PGC_POSTMASTER, PGC_S_OVERRIDE);
+	pfree(max_safe_fds_string);
+
 	/*
 	 * Make sure we still have enough to get by.
 	 */
-- 
2.43.0
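And to illustrate the effect of patch 3 (again, not part of the patch): once
set_max_safe_fds() has overridden the GUC, a plain SHOW max_files_per_process
reports the effective limit, and so does any backend code that reads the
setting. A hypothetical snippet, assuming the existing GetConfigOption() GUC
API:

    #include "postgres.h"
    #include "utils/guc.h"

    /* Hypothetical helper: log the post-adjustment value of the GUC. */
    static void
    log_effective_file_limit(void)
    {
        const char *val = GetConfigOption("max_files_per_process", false, false);

        elog(LOG, "effective max_files_per_process: %s", val);
    }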