I had missed one: the "watch" end of the postmaster pipe also needs FD_CLOEXEC.
From 36f8ed2406307f8b1578ae85510c5a07718e1ea8 Mon Sep 17 00:00:00 2001 From: Thomas Munro <tmu...@postgresql.org> Date: Mon, 20 Feb 2023 23:26:36 +1300 Subject: [PATCH v3 1/3] Don't leak descriptors into subprograms.
Open data and WAL files with O_CLOEXEC, from SUSv4 (POSIX.1-2008). All of our target systems have it, except Windows. Our open() implementation on Windows already has that behavior, so provide a dummy O_CLOEXEC flag on that platform. Do the same for sockets with FD_CLOEXEC. (Later commits might improve this with SOCK_CLOEXEC on systems that have it, but we'll need the FD_CLOEXEC path as a fallback anyway, so do it this way in this initial commit.) Likewise for the postmaster death pipe. This means that programs executed by COPY and archiving commands etc, will not inherit access to those descriptors. Reviewed-by: Andres Freund <and...@anarazel.de> Discussion: https://postgr.es/m/CA%2BhUKGKb6FsAdQWcRL35KJsftv%2B9zXqQbzwkfRf1i0J2e57%2BhQ%40mail.gmail.com --- src/backend/access/transam/xlog.c | 9 ++++++--- src/backend/libpq/pqcomm.c | 10 ++++++++++ src/backend/storage/file/fd.c | 2 -- src/backend/storage/smgr/md.c | 9 +++++---- src/backend/utils/init/miscinit.c | 8 ++++++++ src/include/port/win32_port.h | 7 +++++++ 6 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f9f0f6db8d..87af608d15 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2936,7 +2936,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * Try to use existent file (checkpoint maker may have created it already) */ *added = false; - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC | + get_sync_bit(sync_method)); if (fd < 0) { if (errno != ENOENT) @@ -3097,7 +3098,8 @@ XLogFileInit(XLogSegNo logsegno, TimeLineID logtli) return fd; /* Now open original target segment (might not be file I just made) */ - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC | + get_sync_bit(sync_method)); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -3328,7 +3330,8 @@ XLogFileOpen(XLogSegNo segno, TimeLineID tli) XLogFilePath(path, tli, segno, wal_segment_size); - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC | + get_sync_bit(sync_method)); if (fd < 0) ereport(PANIC, (errcode_for_file_access(), diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 864c9debe8..0ccec33af4 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -200,6 +200,16 @@ pq_init(void) (errmsg("could not set socket to nonblocking mode: %m"))); #endif +#ifndef WIN32 + + /* + * Also make sure that any subprocesses that execute a new program don't + * inherit the socket. + */ + if (fcntl(MyProcPort->sock, F_SETFD, FD_CLOEXEC) < 0) + elog(FATAL, "fcntl(F_SETFD) failed on socket: %m"); +#endif + FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents); socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock, NULL, NULL); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 926d000f2e..933f17b398 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1033,10 +1033,8 @@ tryAgain: O_TRUNC | O_WRONLY)) == 0, "PG_O_DIRECT value collides with standard flag"); -#if defined(O_CLOEXEC) StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0, "PG_O_DIRECT value collides with O_CLOEXEC"); -#endif #if defined(O_DSYNC) StaticAssertStmt((PG_O_DIRECT & O_DSYNC) == 0, "PG_O_DIRECT value collides with O_DSYNC"); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 8da813600c..2070eba4a2 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -205,14 +205,15 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = PathNameOpenFile(path, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY | O_CLOEXEC); if (fd < 0) { int save_errno = errno; if (isRedo) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -523,7 +524,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC); if (fd < 0) { @@ -1210,7 +1211,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); + fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | O_CLOEXEC | oflags); pfree(fullpath); diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 59532bbd80..2f13a13898 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -163,6 +163,14 @@ InitPostmasterChild(void) /* Request a signal if the postmaster dies, if possible. */ PostmasterDeathSignalInit(); + + /* Child processes should not inherit the postmaster pipe. */ +#ifndef WIN32 + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFD, FD_CLOEXEC) < 0) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to FD_CLOEXEC mode: %m"))); +#endif } /* diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h index 9488195799..a8685f1520 100644 --- a/src/include/port/win32_port.h +++ b/src/include/port/win32_port.h @@ -353,6 +353,13 @@ extern int _pglstat64(const char *name, struct stat *buf); */ #define O_DSYNC 0x0080 +/* + * Our open wrapper does not create inheritable handles, so it is safe to + * ignore O_CLOEXEC. (If we were using Windows' own open(), it might be + * necessary to convert this to _O_NOINHERIT.) + */ +#define O_CLOEXEC 0 + /* * Supplement to <errno.h>. * -- 2.39.1
From 002a689ece50518095b9d0fca71d24128fe9b147 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Mon, 6 Feb 2023 13:51:19 +1300 Subject: [PATCH v3 2/3] Use accept4() to accept connections, where available. The postmaster previously accepted connections and then made extra system calls in child processes to make them non-blocking and (as of a recent commit) close-on-exit. On nearly all systems, that can be done atomically with flags to the newer accept4() variant (recently accepted by POSIX for a future revision, but around in the wild for a while now), saving a couple of system calls. The exceptions are Windows, where we didn't have to worry about that problem anyway, EXEC_BACKEND builds on Unix (used by developers only for slightly-more-like-Windows testing), and the straggler Unixes that haven't implemented it yet (at the time of writing: macOS and AIX). Suggested-by: Andres Freund <and...@anarazel.de> Discussion: https://postgr.es/m/CA%2BhUKGKb6FsAdQWcRL35KJsftv%2B9zXqQbzwkfRf1i0J2e57%2BhQ%40mail.gmail.com --- configure | 12 ++++++++++++ configure.ac | 1 + meson.build | 1 + src/backend/libpq/pqcomm.c | 34 +++++++++++++++++++++++++++------- src/include/pg_config.h.in | 4 ++++ 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/configure b/configure index e35769ea73..d06f64cdbb 100755 --- a/configure +++ b/configure @@ -16219,6 +16219,18 @@ _ACEOF # We can't use AC_REPLACE_FUNCS to replace these functions, because it # won't handle deployment target restrictions on macOS +ac_fn_c_check_decl "$LINENO" "accept4" "ac_cv_have_decl_accept4" "#include <sys/socket.h> +" +if test "x$ac_cv_have_decl_accept4" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_ACCEPT4 $ac_have_decl +_ACEOF + ac_fn_c_check_decl "$LINENO" "preadv" "ac_cv_have_decl_preadv" "#include <sys/uio.h> " if test "x$ac_cv_have_decl_preadv" = xyes; then : diff --git a/configure.ac b/configure.ac index af23c15cb2..070a0b33db 100644 --- a/configure.ac +++ b/configure.ac @@ -1843,6 +1843,7 @@ AC_CHECK_DECLS([strlcat, strlcpy, strnlen]) # We can't use AC_REPLACE_FUNCS to replace these functions, because it # won't handle deployment target restrictions on macOS +AC_CHECK_DECLS([accept4], [], [], [#include <sys/socket.h>]) AC_CHECK_DECLS([preadv], [], [AC_LIBOBJ(preadv)], [#include <sys/uio.h>]) AC_CHECK_DECLS([pwritev], [], [AC_LIBOBJ(pwritev)], [#include <sys/uio.h>]) diff --git a/meson.build b/meson.build index f534704452..3e15d6c825 100644 --- a/meson.build +++ b/meson.build @@ -2097,6 +2097,7 @@ decl_checks = [ # checking for library symbols wouldn't handle deployment target # restrictions on macOS decl_checks += [ + ['accept4', 'sys/socket.h'], ['preadv', 'sys/uio.h'], ['pwritev', 'sys/uio.h'], ] diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 0ccec33af4..70734f1a4b 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -184,6 +184,14 @@ pq_init(void) /* set up process-exit hook to close the socket */ on_proc_exit(socket_close, 0); +#ifndef WIN32 + /* + * On most systems, the socket has already been made non-blocking and + * close-on-exec by StreamConnection(). On systems that don't have + * accept4() yet, and EXEC_BACKEND builds that need the socket to survive + * exec*(), we do that separately here in the child process. + */ +#if !HAVE_DECL_ACCEPT4 || defined(EXEC_BACKEND) /* * In backends (as soon as forked) we operate the underlying socket in * nonblocking mode and use latches to implement blocking semantics if @@ -194,13 +202,9 @@ pq_init(void) * the client, which might require changing the mode again, leading to * infinite recursion. */ -#ifndef WIN32 if (!pg_set_noblock(MyProcPort->sock)) ereport(COMMERROR, (errmsg("could not set socket to nonblocking mode: %m"))); -#endif - -#ifndef WIN32 /* * Also make sure that any subprocesses that execute a new program don't @@ -208,6 +212,7 @@ pq_init(void) */ if (fcntl(MyProcPort->sock, F_SETFD, FD_CLOEXEC) < 0) elog(FATAL, "fcntl(F_SETFD) failed on socket: %m"); +#endif #endif FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents); @@ -702,9 +707,24 @@ StreamConnection(pgsocket server_fd, Port *port) { /* accept connection and fill in the client (remote) address */ port->raddr.salen = sizeof(port->raddr.addr); - if ((port->sock = accept(server_fd, - (struct sockaddr *) &port->raddr.addr, - &port->raddr.salen)) == PGINVALID_SOCKET) + +#if HAVE_DECL_ACCEPT4 && !defined(EXEC_BACKEND) + /* + * If we have accept4(), we can avoid a couple of fcntl() calls in + * pq_init(). We can't use this optimization in EXEC_BACKEND builds, + * though, because they need the socket to survive exec(). + */ + port->sock = accept4(server_fd, + (struct sockaddr *) &port->raddr.addr, + &port->raddr.salen, + SOCK_CLOEXEC | SOCK_NONBLOCK); +#else + port->sock = accept(server_fd, + (struct sockaddr *) &port->raddr.addr, + &port->raddr.salen); +#endif + + if (port->sock == PGINVALID_SOCKET) { ereport(LOG, (errcode_for_socket_access(), diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 20c82f5979..e21d0f05f5 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -91,6 +91,10 @@ /* Define to 1 if you have the `CRYPTO_lock' function. */ #undef HAVE_CRYPTO_LOCK +/* Define to 1 if you have the declaration of `accept4', and to 0 if you + don't. */ +#undef HAVE_DECL_ACCEPT4 + /* Define to 1 if you have the declaration of `fdatasync', and to 0 if you don't. */ #undef HAVE_DECL_FDATASYNC -- 2.39.1
From abe187d253e3547ab5dabce9128b973679442baf Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Mon, 6 Feb 2023 14:54:03 +1300 Subject: [PATCH v3 3/3] Use newer client socket options where available. As a recent commit did for the server side, we can now also try to set client sockets to nonblocking and close-on-exit atomically when creating the socket. This uses flags expected in the next POSIX revision, but already widely available on common platforms. This saves a couple of fcntl() calls, and plugs a tiny window for resource leaks in multi-threaded programs that might fork-and-exec while another thread would otherwise have to run socket() and then fcntl(). Discussion: https://postgr.es/m/CA%2BhUKGKb6FsAdQWcRL35KJsftv%2B9zXqQbzwkfRf1i0J2e57%2BhQ%40mail.gmail.com --- src/interfaces/libpq/fe-connect.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 50b5df3490..953abc640f 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -2464,6 +2464,7 @@ keep_going: /* We will come back to here until there is { struct addrinfo *addr_cur = conn->addr_cur; char host_addr[NI_MAXHOST]; + int sock_type; /* * Advance to next possible host, if we've tried all of @@ -2494,7 +2495,23 @@ keep_going: /* We will come back to here until there is conn->connip = strdup(host_addr); /* Try to create the socket */ - conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0); + sock_type = SOCK_STREAM; +#ifdef SOCK_CLOEXEC + /* + * Atomically mark close-on-exec, if possible on this + * platform, for the benefit of multi-threaded programs. + * Otherwise see below. + */ + sock_type |= SOCK_CLOEXEC; +#endif +#ifdef SOCK_NONBLOCK + /* + * We might as well skip a system call for nonblocking + * mode too, if we can. + */ + sock_type |= SOCK_NONBLOCK; +#endif + conn->sock = socket(addr_cur->ai_family, sock_type, 0); if (conn->sock == PGINVALID_SOCKET) { int errorno = SOCK_ERRNO; @@ -2540,6 +2557,7 @@ keep_going: /* We will come back to here until there is goto keep_going; } } +#ifndef SOCK_NONBLOCK if (!pg_set_noblock(conn->sock)) { libpq_append_conn_error(conn, "could not set socket to nonblocking mode: %s", @@ -2547,7 +2565,9 @@ keep_going: /* We will come back to here until there is conn->try_next_addr = true; goto keep_going; } +#endif +#ifndef SOCK_CLOEXEC #ifdef F_SETFD if (fcntl(conn->sock, F_SETFD, FD_CLOEXEC) == -1) { @@ -2557,6 +2577,7 @@ keep_going: /* We will come back to here until there is goto keep_going; } #endif /* F_SETFD */ +#endif if (addr_cur->ai_family != AF_UNIX) { -- 2.39.1