I wrote:
> Here's a draft patch that does it like that.

On reflection, trying to make ReserveExternalFD serve two different
use-cases was pretty messy.  Here's a version that splits it into two
functions.  I also took the trouble to fix dblink.

                        regards, tom lane

diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c
index c1155e3..a5f69fc 100644
--- a/contrib/dblink/dblink.c
+++ b/contrib/dblink/dblink.c
@@ -200,12 +200,27 @@ dblink_get_conn(char *conname_or_str,
 		if (connstr == NULL)
 			connstr = conname_or_str;
 		dblink_connstr_check(connstr);
+
+		/*
+		 * We must obey fd.c's limit on non-virtual file descriptors.  Assume
+		 * that a PGconn represents one long-lived FD.  (Doing this here also
+		 * ensures that VFDs are closed if needed to make room.)
+		 */
+		if (!AcquireExternalFD())
+			ereport(ERROR,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+					 errmsg("could not establish connection"),
+					 errdetail("There are too many open files.")));
+
+		/* OK to make connection */
 		conn = PQconnectdb(connstr);
+
 		if (PQstatus(conn) == CONNECTION_BAD)
 		{
 			char	   *msg = pchomp(PQerrorMessage(conn));
 
 			PQfinish(conn);
+			ReleaseExternalFD();
 			ereport(ERROR,
 					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 					 errmsg("could not establish connection"),
@@ -282,12 +297,26 @@ dblink_connect(PG_FUNCTION_ARGS)
 
 	/* check password in connection string if not superuser */
 	dblink_connstr_check(connstr);
+
+	/*
+	 * We must obey fd.c's limit on non-virtual file descriptors.  Assume that
+	 * a PGconn represents one long-lived FD.  (Doing this here also ensures
+	 * that VFDs are closed if needed to make room.)
+	 */
+	if (!AcquireExternalFD())
+		ereport(ERROR,
+				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+				 errmsg("could not establish connection"),
+				 errdetail("There are too many open files.")));
+
+	/* OK to make connection */
 	conn = PQconnectdb(connstr);
 
 	if (PQstatus(conn) == CONNECTION_BAD)
 	{
 		msg = pchomp(PQerrorMessage(conn));
 		PQfinish(conn);
+		ReleaseExternalFD();
 		if (rconn)
 			pfree(rconn);
 
@@ -312,7 +341,10 @@ dblink_connect(PG_FUNCTION_ARGS)
 	else
 	{
 		if (pconn->conn)
+		{
 			PQfinish(pconn->conn);
+			ReleaseExternalFD();
+		}
 		pconn->conn = conn;
 	}
 
@@ -346,6 +378,7 @@ dblink_disconnect(PG_FUNCTION_ARGS)
 		dblink_conn_not_avail(conname);
 
 	PQfinish(conn);
+	ReleaseExternalFD();
 	if (rconn)
 	{
 		deleteConnection(conname);
@@ -780,7 +813,10 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async)
 	{
 		/* if needed, close the connection to the database */
 		if (freeconn)
+		{
 			PQfinish(conn);
+			ReleaseExternalFD();
+		}
 	}
 	PG_END_TRY();
 
@@ -1458,7 +1494,10 @@ dblink_exec(PG_FUNCTION_ARGS)
 	{
 		/* if needed, close the connection to the database */
 		if (freeconn)
+		{
 			PQfinish(conn);
+			ReleaseExternalFD();
+		}
 	}
 	PG_END_TRY();
 
@@ -2563,6 +2602,7 @@ createNewConnection(const char *name, remoteConn *rconn)
 	if (found)
 	{
 		PQfinish(rconn->conn);
+		ReleaseExternalFD();
 		pfree(rconn);
 
 		ereport(ERROR,
@@ -2604,6 +2644,7 @@ dblink_security_check(PGconn *conn, remoteConn *rconn)
 		if (!PQconnectionUsedPassword(conn))
 		{
 			PQfinish(conn);
+			ReleaseExternalFD();
 			if (rconn)
 				pfree(rconn);
 
diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 29c811a..74576d7 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -20,6 +20,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postgres_fdw.h"
+#include "storage/fd.h"
 #include "storage/latch.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
@@ -259,10 +260,27 @@ connect_pg_server(ForeignServer *server, UserMapping *user)
 
 		keywords[n] = values[n] = NULL;
 
-		/* verify connection parameters and make connection */
+		/* verify the set of connection parameters */
 		check_conn_params(keywords, values, user);
 
+		/*
+		 * We must obey fd.c's limit on non-virtual file descriptors.  Assume
+		 * that a PGconn represents one long-lived FD.  (Doing this here also
+		 * ensures that VFDs are closed if needed to make room.)
+		 */
+		if (!AcquireExternalFD())
+			ereport(ERROR,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+					 errmsg("could not connect to server \"%s\"",
+							server->servername),
+					 errdetail("There are too many open files.")));
+
+		/* OK to make connection */
 		conn = PQconnectdbParams(keywords, values, false);
+
+		if (!conn)
+			ReleaseExternalFD();	/* because the PG_CATCH block won't */
+
 		if (!conn || PQstatus(conn) != CONNECTION_OK)
 			ereport(ERROR,
 					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
@@ -294,7 +312,10 @@ connect_pg_server(ForeignServer *server, UserMapping *user)
 	{
 		/* Release PGconn data structure if we managed to create one */
 		if (conn)
+		{
 			PQfinish(conn);
+			ReleaseExternalFD();
+		}
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -312,6 +333,7 @@ disconnect_pg_server(ConnCacheEntry *entry)
 	{
 		PQfinish(entry->conn);
 		entry->conn = NULL;
+		ReleaseExternalFD();
 	}
 }
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index fd527f2..d19408b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -774,6 +774,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
  * openLogFile is -1 or a kernel FD for an open log file segment.
  * openLogSegNo identifies the segment.  These variables are only used to
  * write the XLOG, and so will normally refer to the active segment.
+ * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
  */
 static int	openLogFile = -1;
 static XLogSegNo openLogSegNo = 0;
@@ -785,6 +786,9 @@ static XLogSegNo openLogSegNo = 0;
  * will be just past that page. readLen indicates how much of the current
  * page has been read into readBuf, and readSource indicates where we got
  * the currently open file from.
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of
+ * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
+ * not read by general-purpose sessions.
  */
 static int	readFile = -1;
 static XLogSegNo readSegNo = 0;
@@ -2447,6 +2451,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			/* create/use new log file */
 			use_existent = true;
 			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
+			ReserveExternalFD();
 		}
 
 		/* Make sure we have the current logfile open */
@@ -2455,6 +2460,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 							wal_segment_size);
 			openLogFile = XLogFileOpen(openLogSegNo);
+			ReserveExternalFD();
 		}
 
 		/* Add current page to the set of pending pages-to-dump */
@@ -2605,6 +2611,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
 								wal_segment_size);
 				openLogFile = XLogFileOpen(openLogSegNo);
+				ReserveExternalFD();
 			}
 
 			issue_xlog_fsync(openLogFile, openLogSegNo);
@@ -3811,6 +3818,7 @@ XLogFileClose(void)
 	}
 
 	openLogFile = -1;
+	ReleaseExternalFD();
 }
 
 /*
@@ -5224,6 +5232,11 @@ BootStrapXLOG(void)
 	use_existent = false;
 	openLogFile = XLogFileInit(1, &use_existent, false);
 
+	/*
+	 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
+	 * close the file again in a moment.
+	 */
+
 	/* Write the first page with the initial record */
 	errno = 0;
 	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 59dc4f3..462b4d7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -602,6 +602,9 @@ retry2:
 
 	pg_freeaddrinfo_all(hints.ai_family, addrs);
 
+	/* Now that we have a long-lived socket, tell fd.c about it. */
+	ReserveExternalFD();
+
 	return;
 
 startup_failed:
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b3986be..cd61665 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2554,9 +2554,14 @@ ClosePostmasterPorts(bool am_syslogger)
 				(errcode_for_file_access(),
 				 errmsg_internal("could not close postmaster death monitoring pipe in child process: %m")));
 	postmaster_alive_fds[POSTMASTER_FD_OWN] = -1;
+	/* Notify fd.c that we released one pipe FD. */
+	ReleaseExternalFD();
 #endif
 
-	/* Close the listen sockets */
+	/*
+	 * Close the postmaster's listen sockets.  These aren't tracked by fd.c,
+	 * so we don't call ReleaseExternalFD() here.
+	 */
 	for (i = 0; i < MAXLISTEN; i++)
 	{
 		if (ListenSocket[i] != PGINVALID_SOCKET)
@@ -2566,7 +2571,10 @@ ClosePostmasterPorts(bool am_syslogger)
 		}
 	}
 
-	/* If using syslogger, close the read side of the pipe */
+	/*
+	 * If using syslogger, close the read side of the pipe.  We don't bother
+	 * tracking this in fd.c, either.
+	 */
 	if (!am_syslogger)
 	{
 #ifndef WIN32
@@ -4279,6 +4287,9 @@ BackendInitialize(Port *port)
 	/* Save port etc. for ps status */
 	MyProcPort = port;
 
+	/* Tell fd.c about the long-lived FD associated with the port */
+	ReserveExternalFD();
+
 	/*
 	 * PreAuthDelay is a debugging aid for investigating problems in the
 	 * authentication cycle: it can be set in postgresql.conf to allow time to
@@ -6442,6 +6453,20 @@ restore_backend_variables(BackendParameters *param, Port *port)
 	strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH);
 
 	strlcpy(ExtraOptions, param->ExtraOptions, MAXPGPATH);
+
+	/*
+	 * We need to restore fd.c's counts of externally-opened FDs; to avoid
+	 * confusion, be sure to do this after restoring max_safe_fds.  (Note:
+	 * BackendInitialize will handle this for port->sock.)
+	 */
+#ifndef WIN32
+	if (postmaster_alive_fds[0] >= 0)
+		ReserveExternalFD();
+	if (postmaster_alive_fds[1] >= 0)
+		ReserveExternalFD();
+#endif
+	if (pgStatSock != PGINVALID_SOCKET)
+		ReserveExternalFD();
 }
 
 
@@ -6584,6 +6609,10 @@ InitPostmasterDeathWatchHandle(void)
 				(errcode_for_file_access(),
 				 errmsg_internal("could not create pipe to monitor postmaster death: %m")));
 
+	/* Notify fd.c that we've eaten two FDs for the pipe. */
+	ReserveExternalFD();
+	ReserveExternalFD();
+
 	/*
 	 * Set O_NONBLOCK to allow testing for the fd's presence with a read()
 	 * call.
diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c
index b2b69a7..cf7b535 100644
--- a/src/backend/postmaster/syslogger.c
+++ b/src/backend/postmaster/syslogger.c
@@ -562,6 +562,11 @@ SysLogger_Start(void)
 	 * This means the postmaster must continue to hold the read end of the
 	 * pipe open, so we can pass it down to the reincarnated syslogger. This
 	 * is a bit klugy but we have little choice.
+	 *
+	 * Also note that we don't bother counting the pipe FDs by calling
+	 * Reserve/ReleaseExternalFD.  There's no real need to account for them
+	 * accurately in the postmaster or syslogger process, and both ends of the
+	 * pipe will wind up closed in all other postmaster children.
 	 */
 #ifndef WIN32
 	if (syslogPipe[0] < 0)
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index b5f4df6..34f7443 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -61,6 +61,12 @@
  * BasicOpenFile, it is solely the caller's responsibility to close the file
  * descriptor by calling close(2).
  *
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open.  This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
  *-------------------------------------------------------------------------
  */
 
@@ -103,8 +109,8 @@
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
- * is the number left free.  (While we can be pretty sure we won't get
- * EMFILE, there's never any guarantee that we won't get ENFILE due to
+ * is the number left free.  (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
  * other processes chewing up FDs.  So it's a bad idea to try to open files
  * without consulting fd.c.  Nonetheless we cannot control all code.)
  *
@@ -119,9 +125,12 @@
 
 /*
  * If we have fewer than this many usable FDs after allowing for the reserved
- * ones, choke.
+ * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that.  Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
  */
-#define FD_MINFREE				10
+#define FD_MINFREE				48
 
 /*
  * A number of platforms allow individual processes to open many more files
@@ -132,8 +141,8 @@
 int			max_files_per_process = 1000;
 
 /*
- * Maximum number of file descriptors to open for either VFD entries or
- * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
  * to a conservative value, and remains that way indefinitely in bootstrap or
  * standalone-backend cases.  In normal postmaster operation, the postmaster
  * calls set_max_safe_fds() late in initialization to update the value, and
@@ -142,7 +151,7 @@ int			max_files_per_process = 1000;
  * Note: the value of max_files_per_process is taken into account while
  * setting this variable, and so need not be tested separately.
  */
-int			max_safe_fds = 32;	/* default if not changed */
+int			max_safe_fds = FD_MINFREE;	/* default if not changed */
 
 /* Whether it is safe to continue running after fsync() fails. */
 bool		data_sync_retry = false;
@@ -244,6 +253,11 @@ static int	maxAllocatedDescs = 0;
 static AllocateDesc *allocatedDescs = NULL;
 
 /*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int	numExternalFDs = 0;
+
+/*
  * Number of temporary files opened during the current session;
  * this is used in generation of tempfile names.
  */
@@ -1025,6 +1039,80 @@ tryAgain:
 	return -1;					/* failure */
 }
 
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved.  This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+	/*
+	 * We don't want more than max_safe_fds / 3 FDs to be consumed for
+	 * "external" FDs.
+	 */
+	if (numExternalFDs < max_safe_fds / 3)
+	{
+		ReserveExternalFD();
+		return true;
+	}
+	errno = EMFILE;
+	return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.  This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure.  Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice.  In any case, it's solely
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+	/*
+	 * Release VFDs if needed to stay safe.  Because we do this before
+	 * incrementing numExternalFDs, the final state will be as desired, i.e.,
+	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+	 */
+	ReleaseLruFiles();
+
+	numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+	Assert(numExternalFDs > 0);
+	numExternalFDs--;
+}
+
+
 #if defined(FDDEBUG)
 
 static void
@@ -1185,7 +1273,7 @@ ReleaseLruFile(void)
 static void
 ReleaseLruFiles(void)
 {
-	while (nfile + numAllocatedDescs >= max_safe_fds)
+	while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
 	{
 		if (!ReleaseLruFile())
 			break;
@@ -2176,13 +2264,13 @@ reserveAllocatedDesc(void)
 
 	/*
 	 * If the array hasn't yet been created in the current process, initialize
-	 * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
+	 * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
 	 * we will ever need, anyway.  We don't want to look at max_safe_fds
 	 * immediately because set_max_safe_fds() may not have run yet.
 	 */
 	if (allocatedDescs == NULL)
 	{
-		newMax = FD_MINFREE / 2;
+		newMax = FD_MINFREE / 3;
 		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
 		/* Out of memory already?  Treat as fatal error. */
 		if (newDescs == NULL)
@@ -2200,10 +2288,12 @@ reserveAllocatedDesc(void)
 	 *
 	 * We mustn't let allocated descriptors hog all the available FDs, and in
 	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
-	 * set the maximum to max_safe_fds / 2.  (This should certainly be at
-	 * least as large as the initial size, FD_MINFREE / 2.)
+	 * set the maximum to max_safe_fds / 3.  (This should certainly be at
+	 * least as large as the initial size, FD_MINFREE / 3, so we aren't
+	 * tightening the restriction here.)  Recall that "external" FDs are
+	 * allowed to consume another third of max_safe_fds.
 	 */
-	newMax = max_safe_fds / 2;
+	newMax = max_safe_fds / 3;
 	if (newMax > maxAllocatedDescs)
 	{
 		newDescs = (AllocateDesc *) realloc(allocatedDescs,
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 138bdec..1972aec 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -247,14 +247,17 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 	/*
 	 * Create new segment or open an existing one for attach.
 	 *
-	 * Even though we're not going through fd.c, we should be safe against
-	 * running out of file descriptors, because of NUM_RESERVED_FDS.  We're
-	 * only opening one extra descriptor here, and we'll close it before
-	 * returning.
+	 * Even though we will close the FD before returning, it seems desirable
+	 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
+	 * failure.  The fact that we won't hold the FD open long justifies using
+	 * ReserveExternalFD rather than AcquireExternalFD, though.
 	 */
+	ReserveExternalFD();
+
 	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
 	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
 	{
+		ReleaseExternalFD();
 		if (errno != EEXIST)
 			ereport(elevel,
 					(errcode_for_dynamic_shared_memory(),
@@ -278,6 +281,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 			/* Back out what's already been done. */
 			save_errno = errno;
 			close(fd);
+			ReleaseExternalFD();
 			errno = save_errno;
 
 			ereport(elevel,
@@ -295,6 +299,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 		/* Back out what's already been done. */
 		save_errno = errno;
 		close(fd);
+		ReleaseExternalFD();
 		shm_unlink(name);
 		errno = save_errno;
 
@@ -323,6 +328,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 		/* Back out what's already been done. */
 		save_errno = errno;
 		close(fd);
+		ReleaseExternalFD();
 		if (op == DSM_OP_CREATE)
 			shm_unlink(name);
 		errno = save_errno;
@@ -336,6 +342,7 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 	*mapped_address = address;
 	*mapped_size = request_size;
 	close(fd);
+	ReleaseExternalFD();
 
 	return true;
 }
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index cbd4952..046ca5c 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -51,6 +51,7 @@
 #include "port/atomics.h"
 #include "portability/instr_time.h"
 #include "postmaster/postmaster.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
@@ -187,6 +188,9 @@ InitializeLatchSupport(void)
 			/* Clean up, just for safety's sake; we'll set these below */
 			selfpipe_readfd = selfpipe_writefd = -1;
 			selfpipe_owner_pid = 0;
+			/* Keep fd.c's accounting straight */
+			ReleaseExternalFD();
+			ReleaseExternalFD();
 		}
 		else
 		{
@@ -194,6 +198,7 @@ InitializeLatchSupport(void)
 			 * Postmaster didn't create a self-pipe ... or else we're in an
 			 * EXEC_BACKEND build, in which case it doesn't matter since the
 			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+			 * fd.c won't have state to clean up, either.
 			 */
 			Assert(selfpipe_readfd == -1);
 		}
@@ -228,6 +233,10 @@ InitializeLatchSupport(void)
 	selfpipe_readfd = pipefd[0];
 	selfpipe_writefd = pipefd[1];
 	selfpipe_owner_pid = MyProcPid;
+
+	/* Tell fd.c about these two long-lived FDs */
+	ReserveExternalFD();
+	ReserveExternalFD();
 #else
 	/* currently, nothing to do here for Windows */
 #endif
@@ -604,24 +613,57 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 	set->exit_on_postmaster_death = false;
 
 #if defined(WAIT_USE_EPOLL)
+	if (!AcquireExternalFD())
+	{
+		/* treat this as though epoll_create1 itself returned EMFILE */
+		elog(ERROR, "epoll_create1 failed: %m");
+	}
 #ifdef EPOLL_CLOEXEC
 	set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 	if (set->epoll_fd < 0)
+	{
+		ReleaseExternalFD();
 		elog(ERROR, "epoll_create1 failed: %m");
+	}
 #else
 	/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
 	set->epoll_fd = epoll_create(nevents);
 	if (set->epoll_fd < 0)
+	{
+		ReleaseExternalFD();
 		elog(ERROR, "epoll_create failed: %m");
+	}
 	if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
+	{
+		int			save_errno = errno;
+
+		close(set->epoll_fd);
+		ReleaseExternalFD();
+		errno = save_errno;
 		elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
+	}
 #endif							/* EPOLL_CLOEXEC */
 #elif defined(WAIT_USE_KQUEUE)
+	if (!AcquireExternalFD())
+	{
+		/* treat this as though kqueue itself returned EMFILE */
+		elog(ERROR, "kqueue failed: %m");
+	}
 	set->kqueue_fd = kqueue();
 	if (set->kqueue_fd < 0)
+	{
+		ReleaseExternalFD();
 		elog(ERROR, "kqueue failed: %m");
+	}
 	if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+	{
+		int			save_errno = errno;
+
+		close(set->kqueue_fd);
+		ReleaseExternalFD();
+		errno = save_errno;
 		elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+	}
 	set->report_postmaster_not_running = false;
 #elif defined(WAIT_USE_WIN32)
 
@@ -655,8 +697,10 @@ FreeWaitEventSet(WaitEventSet *set)
 {
 #if defined(WAIT_USE_EPOLL)
 	close(set->epoll_fd);
+	ReleaseExternalFD();
 #elif defined(WAIT_USE_KQUEUE)
 	close(set->kqueue_fd);
+	ReleaseExternalFD();
 #elif defined(WAIT_USE_WIN32)
 	WaitEvent  *cur_event;
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 51e2ece..2085c62 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -35,6 +35,10 @@
  * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
  * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
  * unbuffered file descriptor.
+ *
+ * If you really can't use any of the above, at least call AcquireExternalFD
+ * or ReserveExternalFD to report any file descriptors that are held for any
+ * length of time.  Failure to do so risks unnecessary EMFILE errors.
  */
 #ifndef FD_H
 #define FD_H
@@ -120,7 +124,12 @@ extern int	CloseTransientFile(int fd);
 extern int	BasicOpenFile(const char *fileName, int fileFlags);
 extern int	BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
 
- /* Make a directory with default permissions */
+/* Use these for other cases, and also for long-lived BasicOpenFile FDs */
+extern bool AcquireExternalFD(void);
+extern void ReserveExternalFD(void);
+extern void ReleaseExternalFD(void);
+
+/* Make a directory with default permissions */
 extern int	MakePGDirectory(const char *directoryName);
 
 /* Miscellaneous support routines */

Reply via email to