On Mon, Dec 16, 2024 at 6:00 PM Alexander Lakhin <exclus...@gmail.com> wrote:
> It turned out that OpenBSD has semmns as low as 60 (see [4])

Whenever I run into this, or my Mac requires manual ipcrm to clean up
leaked SysV kernel junk, I rebase my patch for sema_kind = 'futex'.
Here it goes.  It could be updated to support NetBSD I believe, but I
didn't try as its futex stuff came out later.

Then I remember why I didn't go anywhere with it.  It triggers a
thought loop about flipping it all around: use futexes to implement
lwlocks directly in place, and get rid of semaphores completely, but
that involves a few rabbit holes and sub-projects.  From memory:
classic r/w lock implementation on futexes is tricky but doable in the
portability constraints, futex fallback implementation even works
surprisingly well but has fun memory map sub-problems, actually lwlock
is not really a classic r/w lock as it has sprouted extra funky APIs
that lead the intrepid rabbit-holer to design an entirely different
new concurrency primitive that is really wanted for those users, a
couple of other places use raw semaphores directly namely procarray.c
and clog.c and if you stare at those for long you will be overwhelmed
with a desire to rewrite them, EOVERFLOW.
From 42054d64062da58e44a383d0ed0c1c6bb2ba88e1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 24 Oct 2021 21:48:26 +1300
Subject: [PATCH 1/3] A basic API for futexes.

A thin wrapper for basic 32 bit futex wait and wake.  Currently, it maps
to native support on Linux, DragonFlyBSD, FreeBSD, OpenBSD and macOS,
with detection via configure/meson.

NetBSD could probably be added, not investigated.  Windows'
WaitOnAddress() can't because it only works between threads.  A
latch-based backend-only fallback implementation is plausible.
---
 configure                    |   4 +-
 configure.ac                 |   5 +
 meson.build                  |   5 +
 src/backend/port/meson.build |   2 +-
 src/include/pg_config.h.in   |  15 +++
 src/include/port/pg_futex.h  | 171 +++++++++++++++++++++++++++++++++++
 6 files changed, 199 insertions(+), 3 deletions(-)
 create mode 100644 src/include/port/pg_futex.h

diff --git a/configure b/configure
index 518c33b73a9..6eb25178dab 100755
--- a/configure
+++ b/configure
@@ -13227,7 +13227,7 @@ fi
 ## Header files
 ##
 
-for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h linux/futex.h mbarrier.h sys/epoll.h sys/event.h sys/futex.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h sys/umtx.h termios.h ucred.h xlocale.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -15044,7 +15044,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols copyfile copy_file_range elf_aux_info getauxval getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in __ulock_wait backtrace_symbols copyfile copy_file_range elf_aux_info getauxval getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range umtx_sleep uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 247ae97fa4c..6b4f3e0f2e5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1438,14 +1438,17 @@ AC_CHECK_HEADERS(m4_normalize([
 	execinfo.h
 	getopt.h
 	ifaddrs.h
+	linux/futex.h
 	mbarrier.h
 	sys/epoll.h
 	sys/event.h
+	sys/futex.h
 	sys/personality.h
 	sys/prctl.h
 	sys/procctl.h
 	sys/signalfd.h
 	sys/ucred.h
+	sys/umtx.h
 	termios.h
 	ucred.h
 	xlocale.h
@@ -1707,6 +1710,7 @@ LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
 AC_CHECK_FUNCS(m4_normalize([
+	__ulock_wait
 	backtrace_symbols
 	copyfile
 	copy_file_range
@@ -1727,6 +1731,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	strsignal
 	syncfs
 	sync_file_range
+	umtx_sleep
 	uselocale
 	wcstombs_l
 ]))
diff --git a/meson.build b/meson.build
index e5ce437a5c7..5c9775f1a6e 100644
--- a/meson.build
+++ b/meson.build
@@ -2380,15 +2380,18 @@ header_checks = [
   'execinfo.h',
   'getopt.h',
   'ifaddrs.h',
+  'linux/futex.h',
   'mbarrier.h',
   'strings.h',
   'sys/epoll.h',
   'sys/event.h',
+  'sys/futex.h',
   'sys/personality.h',
   'sys/prctl.h',
   'sys/procctl.h',
   'sys/signalfd.h',
   'sys/ucred.h',
+  'sys/umtx.h',
   'termios.h',
   'ucred.h',
   'xlocale.h',
@@ -2611,6 +2614,7 @@ endif
 # XXX: Might be worth conditioning some checks on the OS, to avoid doing
 # unnecessary checks over and over, particularly on windows.
 func_checks = [
+  ['__ulock_wait'],
   ['backtrace_symbols', {'dependencies': [execinfo_dep]}],
   ['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
   ['copyfile'],
@@ -2654,6 +2658,7 @@ func_checks = [
   ['strsignal'],
   ['sync_file_range'],
   ['syncfs'],
+  ['umtx_sleep'],
   ['uselocale'],
   ['wcstombs_l'],
 ]
diff --git a/src/backend/port/meson.build b/src/backend/port/meson.build
index 7820e86016d..e34499bafb3 100644
--- a/src/backend/port/meson.build
+++ b/src/backend/port/meson.build
@@ -5,7 +5,7 @@ backend_sources += files(
 )
 
 
-if cdata.has('USE_UNNAMED_POSIX_SEMAPHORES') or cdata.has('USE_NAMED_POSIX_SEMAPHORES')
+if cdata.has('USE_UNNAMED_POSIX_SEMAPHORES') or cdata.has('USE_NAMED_POSIX_SEMAPHORES') or cdata.has('USE_FUTEX_SEMAPHORES')
   backend_sources += files('posix_sema.c')
 endif
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798abd..19cbf6e74ee 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -265,6 +265,9 @@
 /* Define to 1 if you have the `zstd' library (-lzstd). */
 #undef HAVE_LIBZSTD
 
+/* Define to 1 if you have the <linux/futex.h> header file. */
+#undef HAVE_LINUX_FUTEX_H
+
 /* Define to 1 if you have the <mbarrier.h> header file. */
 #undef HAVE_MBARRIER_H
 
@@ -418,6 +421,9 @@
 /* Define to 1 if you have the <sys/event.h> header file. */
 #undef HAVE_SYS_EVENT_H
 
+/* Define to 1 if you have the <sys/futex.h> header file. */
+#undef HAVE_SYS_FUTEX_H
+
 /* Define to 1 if you have the <sys/personality.h> header file. */
 #undef HAVE_SYS_PERSONALITY_H
 
@@ -439,6 +445,9 @@
 /* Define to 1 if you have the <sys/ucred.h> header file. */
 #undef HAVE_SYS_UCRED_H
 
+/* Define to 1 if you have the <sys/umtx.h> header file. */
+#undef HAVE_SYS_UMTX_H
+
 /* Define to 1 if you have the <termios.h> header file. */
 #undef HAVE_TERMIOS_H
 
@@ -448,6 +457,9 @@
 /* Define to 1 if you have the <ucred.h> header file. */
 #undef HAVE_UCRED_H
 
+/* Define to 1 if you have the `umtx_sleep' function. */
+#undef HAVE_UMTX_SLEEP
+
 /* Define to 1 if the system has the type `union semun'. */
 #undef HAVE_UNION_SEMUN
 
@@ -538,6 +550,9 @@
 /* Define to 1 if your compiler understands _Static_assert. */
 #undef HAVE__STATIC_ASSERT
 
+/* Define to 1 if you have the `__ulock_wait' function. */
+#undef HAVE___ULOCK_WAIT
+
 /* Define as the maximum alignment requirement of any C data type. */
 #undef MAXIMUM_ALIGNOF
 
diff --git a/src/include/port/pg_futex.h b/src/include/port/pg_futex.h
new file mode 100644
index 00000000000..e5ae05d1d5a
--- /dev/null
+++ b/src/include/port/pg_futex.h
@@ -0,0 +1,171 @@
+/*
+ * Minimal wrapper over futex APIs.
+ */
+
+#ifndef PG_FUTEX_H
+#define PG_FUTEX_H
+
+#if defined(HAVE_LINUX_FUTEX_H)
+
+/* https://man7.org/linux/man-pages/man2/futex.2.html */
+
+#include <linux/futex.h>
+#include <sys/syscall.h>
+
+#elif defined(HAVE_SYS_FUTEX_H)
+
+/* https://man.openbsd.org/futex, since OpenBSD 6.2. */
+
+#include <sys/time.h>
+#include <sys/futex.h>
+
+#elif defined(HAVE_SYS_UMTX_H)
+
+/* https://www.freebsd.org/cgi/man.cgi?query=_umtx_op */
+
+#include <sys/types.h>
+#include <sys/umtx.h>
+
+#elif defined(HAVE_UMTX_SLEEP)
+
+/* https://man.dragonflybsd.org/?command=umtx&section=2 */
+
+#include <unistd.h>
+
+#elif defined(HAVE___ULOCK_WAIT)
+
+/*
+ * This interface is undocumented, but provided by libSystem.dylib since
+ * xnu-3789.1.32 (macOS 10.12, 2016) and is used by eg libc++.
+ *
+ * https://github.com/apple/darwin-xnu/blob/main/bsd/kern/sys_ulock.c
+ * https://github.com/apple/darwin-xnu/blob/main/bsd/sys/ulock.h
+ */
+
+#include <stdint.h>
+
+#define UL_COMPARE_AND_WAIT_SHARED		3
+#define ULF_WAKE_ALL					0x00000100
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+extern int	__ulock_wait(uint32_t operation,
+						 void *addr,
+						 uint64_t value,
+						 uint32_t timeout);
+extern int	__ulock_wake(uint32_t operation,
+						 void *addr,
+						 uint64_t wake_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*
+ * Wait for someone to call pg_futex_wake() for the same address, after first
+ * checking that the value at 'fut' matches 'value', with an optional timeout.
+ * Returns 0 when woken, else -1 with errno set: EAGAIN if the value check
+ * fails, otherwise errors including EINTR, ETIMEDOUT and EFAULT.  For APIs
+ * taking microseconds, where 0 means no timeout, round up to 1 and clamp.
+ */
+static int
+pg_futex_wait_u32(volatile void *fut,
+				  uint32 value,
+				  struct timespec *timeout)
+{
+#if defined(HAVE_LINUX_FUTEX_H)
+	if (syscall(SYS_futex, fut, FUTEX_WAIT, value, timeout, 0, 0) == 0)
+		return 0;
+#elif defined(HAVE_SYS_FUTEX_H)
+	if ((errno = futex((void *) fut, FUTEX_WAIT, (int) value, timeout, NULL)) == 0)
+		return 0;
+	if (errno == ECANCELED)
+		errno = EINTR;
+#elif defined(HAVE_SYS_UMTX_H)
+	if (_umtx_op((void *) fut, UMTX_OP_WAIT_UINT, value, 0, timeout) == 0)
+		return 0;
+#elif defined(HAVE_UMTX_SLEEP)
+	/* A sub-microsecond timeout must not truncate to 0 ("no timeout"). */
+	if (umtx_sleep((volatile const int *) fut, (int) value,
+				   timeout ? (int) Min(Max((int64) timeout->tv_sec * 1000000 +
+										   timeout->tv_nsec / 1000, 1), PG_INT32_MAX) : 0) == 0)
+		return 0;
+	if (errno == EBUSY)
+		errno = EAGAIN;
+#elif defined (HAVE___ULOCK_WAIT)
+	if (__ulock_wait(UL_COMPARE_AND_WAIT_SHARED, (void *) fut, value,
+					 timeout ? (uint32) Min(Max((int64) timeout->tv_sec * 1000000 +
+												timeout->tv_nsec / 1000, 1), PG_UINT32_MAX) : 0) >= 0)
+		return 0;
+#else
+	/*
+	 * If we wanted to simulate futexes on systems that don't have them, here
+	 * we could add a link from our PGPROC struct to a shared memory hash
+	 * table using "fut" (ie address) as the key, then compare *fut == value.
+	 * If false, remove link and fail with EAGAIN.  If true, sleep on proc
+	 * latch.  This wouldn't work for DSM segments; for those, we could search
+	 * for matching DSM segment mappings in this process, and convert the key
+	 * to { segment ID, offset }, just like kernels do internally to make
+	 * inter-process futexes work on shared memory, but... ugh.
+	 */
+	errno = ENOSYS;
+#endif
+
+	Assert(errno != 0);
+
+	return -1;
+}
+
+/*
+ * Wake up to 'nwaiters' waiters currently blocked in a futex wait on the
+ * address 'fut'.  The result is 0 for success, or -1 with errno set on
+ * failure.  The number of waiters actually woken is deliberately not
+ * reported, because only some of the underlying interfaces can supply it.
+ */
+static int
+pg_futex_wake(volatile void *fut, int nwaiters)
+{
+	bool		woke;
+
+#if defined(HAVE_LINUX_FUTEX_H)
+	woke = syscall(SYS_futex, fut, FUTEX_WAKE, nwaiters, NULL, 0, 0) >= 0;
+#elif defined(HAVE_SYS_FUTEX_H)
+	woke = futex(fut, FUTEX_WAKE, nwaiters, NULL, NULL) >= 0;
+#elif defined(HAVE_SYS_UMTX_H)
+	woke = _umtx_op((void *) fut, UMTX_OP_WAKE, nwaiters, 0, 0) == 0;
+#elif defined(HAVE_UMTX_SLEEP)
+	woke = umtx_wakeup((volatile const int *) fut, nwaiters) == 0;
+#elif defined (HAVE___ULOCK_WAIT)
+	/* A failure with ENOENT is also counted as success here. */
+	woke = __ulock_wake(UL_COMPARE_AND_WAIT_SHARED |
+						(nwaiters > 1 ? ULF_WAKE_ALL : 0),
+						(void *) fut,
+						0) >= 0 || errno == ENOENT;
+#else
+	/* No implementation available. */
+	errno = ENOSYS;
+	woke = false;
+#endif
+
+	if (woke)
+		return 0;
+
+	Assert(errno != 0);
+	return -1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif							/* PG_FUTEX_H */
-- 
2.47.1

From 4614b62e6006f202a3a739175b73684e51c58914 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 24 Oct 2021 21:48:26 +1300
Subject: [PATCH 2/3] Add futex-based semaphore replacement.

Provide a drop-in replacement for POSIX unnamed semaphores using
futexes.  This is useful for systems that don't have unnamed semaphores
at all, or don't have unnamed semaphores that work inter-process.  This
should be more convenient because the alternatives require kernel
resources and configuration and can also leak in various scenarios.
---
 configure                     |  16 +++++-
 configure.ac                  |  16 +++++-
 src/backend/port/posix_sema.c | 100 +++++++++++++++++++++++++++++++++-
 src/include/pg_config.h.in    |   3 +
 4 files changed, 128 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 6eb25178dab..acbd7cecaac 100755
--- a/configure
+++ b/configure
@@ -17632,6 +17632,10 @@ if test "$ac_res" != no; then :
 fi
 
   fi
+  if test x"$PREFERRED_SEMAPHORES" = x"FUTEX" ; then
+    # Need futex implementation for this
+    USE_FUTEX_SEMAPHORES=1
+  fi
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking which semaphore API to use" >&5
 $as_echo_n "checking which semaphore API to use... " >&6; }
   if test x"$USE_NAMED_POSIX_SEMAPHORES" = x"1" ; then
@@ -17648,11 +17652,19 @@ $as_echo "#define USE_UNNAMED_POSIX_SEMAPHORES 1" >>confdefs.h
       SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
       sematype="unnamed POSIX"
     else
+      if test x"$USE_FUTEX_SEMAPHORES" = x"1" ; then
+
+$as_echo "#define USE_FUTEX_SEMAPHORES 1" >>confdefs.h
+
+        SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
+        sematype="futex"
+      else
 
 $as_echo "#define USE_SYSV_SEMAPHORES 1" >>confdefs.h
 
-      SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
-      sematype="System V"
+        SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
+        sematype="System V"
+      fi
     fi
   fi
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: $sematype" >&5
diff --git a/configure.ac b/configure.ac
index 6b4f3e0f2e5..9001c85a74e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2164,6 +2164,10 @@ if test "$PORTNAME" != "win32"; then
     # Need sem_init for this
     AC_SEARCH_LIBS(sem_init, [rt pthread], [USE_UNNAMED_POSIX_SEMAPHORES=1])
   fi
+  if test x"$PREFERRED_SEMAPHORES" = x"FUTEX" ; then
+    # Need futex implementation for this
+    USE_FUTEX_SEMAPHORES=1
+  fi
   AC_MSG_CHECKING([which semaphore API to use])
   if test x"$USE_NAMED_POSIX_SEMAPHORES" = x"1" ; then
     AC_DEFINE(USE_NAMED_POSIX_SEMAPHORES, 1, [Define to select named POSIX semaphores.])
@@ -2175,9 +2179,15 @@ if test "$PORTNAME" != "win32"; then
       SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
       sematype="unnamed POSIX"
     else
-      AC_DEFINE(USE_SYSV_SEMAPHORES, 1, [Define to select SysV-style semaphores.])
-      SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
-      sematype="System V"
+      if test x"$USE_FUTEX_SEMAPHORES" = x"1" ; then
+        AC_DEFINE(USE_FUTEX_SEMAPHORES, 1, [Define to select futex semaphores.])
+        SEMA_IMPLEMENTATION="src/backend/port/posix_sema.c"
+        sematype="futex"
+      else
+        AC_DEFINE(USE_SYSV_SEMAPHORES, 1, [Define to select SysV-style semaphores.])
+        SEMA_IMPLEMENTATION="src/backend/port/sysv_sema.c"
+        sematype="System V"
+      fi
     fi
   fi
   AC_MSG_RESULT([$sematype])
diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c
index 64186ec0a7e..88feec98d40 100644
--- a/src/backend/port/posix_sema.c
+++ b/src/backend/port/posix_sema.c
@@ -36,6 +36,10 @@
 #include "storage/pg_sema.h"
 #include "storage/shmem.h"
 
+#if defined(USE_FUTEX_SEMAPHORES)
+#include "port/atomics.h"
+#include "port/pg_futex.h"
+#endif
 
 /* see file header comment */
 #if defined(USE_NAMED_POSIX_SEMAPHORES) && defined(EXEC_BACKEND)
@@ -45,6 +49,9 @@
 typedef union SemTPadded
 {
 	sem_t		pgsem;
+#if defined(USE_FUTEX_SEMAPHORES)
+	pg_atomic_uint32 futexsem;
+#endif
 	char		pad[PG_CACHE_LINE_SIZE];
 } SemTPadded;
 
@@ -70,6 +77,72 @@ static int	nextSemKey;			/* next name to try */
 
 static void ReleaseSemaphores(int status, Datum arg);
 
+#ifdef USE_FUTEX_SEMAPHORES
+
+/*
+ * An implementation of POSIX unnamed semaphores in shared memory, for OSes
+ * that lack them but have futexes.
+ */
+
+/*
+ * Like standard sem_init() with pshared set to 1, meaning that it can work in
+ * shared memory.  Just sets the counter at 'fut' to the initial 'value'.
+ */
+static void
+pg_futex_sem_init(pg_atomic_uint32 *fut, uint32 value)
+{
+	pg_atomic_init_u32(fut, value);
+}
+
+/*
+ * Counterpart of sem_post(): add one to the count, then wake any waiters.
+ */
+static int
+pg_futex_sem_post(pg_atomic_uint32 *fut)
+{
+	(void) pg_atomic_fetch_add_u32(fut, 1);
+
+	/*
+	 * XXX The wake call below is unconditional.  If some bits held a waiter
+	 * count, the value returned above could tell us when it can be skipped.
+	 * Currently we use semaphores as the slow path for lwlocks, so there is
+	 * always expected to be a waiter.
+	 */
+	return pg_futex_wake(fut, INT_MAX);
+}
+
+/*
+ * Like standard sem_wait(): block until the count is above zero, then take 1.
+ * Returns 0 on success, or -1 with errno set (e.g. EINTR) if the wait fails.
+ */
+static int
+pg_futex_sem_wait(pg_atomic_uint32 *fut)
+{
+	uint32		value = 1;
+
+	/*
+	 * The futex API takes void *, so there is no type checking or casting.
+	 * Assert that pg_atomic_uint32 is really just a wrapped uint32_t as
+	 * required by the kernel for 32 bit futex pre-check.
+	 */
+	StaticAssertStmt(sizeof(*fut) == sizeof(uint32), "unexpected size");
+
+	while (!pg_atomic_compare_exchange_u32(fut, &value, value - 1))
+	{
+		if (value != 0)
+			continue;			/* count is positive: retry with fresh value */
+
+		/* Wait for someone else to move it above 0; EAGAIN means it moved. */
+		if (pg_futex_wait_u32(fut, 0, NULL) < 0 && errno != EAGAIN)
+			return -1;
+
+		/* Never let the CAS expect 0, or we'd decrement 0 to UINT32_MAX. */
+		value = 1;
+	}
+	return 0;
+}
+
+#endif
 
 #ifdef USE_NAMED_POSIX_SEMAPHORES
 
@@ -124,7 +197,7 @@ PosixSemaphoreCreate(void)
 
 	return mySem;
 }
-#else							/* !USE_NAMED_POSIX_SEMAPHORES */
+#elif defined(USE_UNNAMED_POSIX_SEMAPHORES)
 
 /*
  * PosixSemaphoreCreate
@@ -139,6 +212,7 @@ PosixSemaphoreCreate(sem_t *sem)
 }
 #endif							/* USE_NAMED_POSIX_SEMAPHORES */
 
+#ifndef USE_FUTEX_SEMAPHORES
 
 /*
  * PosixSemaphoreKill	- removes a semaphore
@@ -156,6 +230,7 @@ PosixSemaphoreKill(sem_t *sem)
 		elog(LOG, "sem_destroy failed: %m");
 #endif
 }
+#endif
 
 
 /*
@@ -238,18 +313,22 @@ PGReserveSemaphores(int maxSemas)
 static void
 ReleaseSemaphores(int status, Datum arg)
 {
+#ifdef USE_NAMED_POSIX_SEMAPHORES
 	int			i;
 
-#ifdef USE_NAMED_POSIX_SEMAPHORES
 	for (i = 0; i < numSems; i++)
 		PosixSemaphoreKill(mySemPointers[i]);
 	free(mySemPointers);
 #endif
 
 #ifdef USE_UNNAMED_POSIX_SEMAPHORES
+	int			i;
+
 	for (i = 0; i < numSems; i++)
 		PosixSemaphoreKill(PG_SEM_REF(sharedSemas + i));
 #endif
+
+	/* Futex-based semaphores have no kernel resource to clean up. */
 }
 
 /*
@@ -261,7 +340,9 @@ PGSemaphore
 PGSemaphoreCreate(void)
 {
 	PGSemaphore sema;
+#ifndef USE_FUTEX_SEMAPHORES
 	sem_t	   *newsem;
+#endif
 
 	/* Can't do this in a backend, because static state is postmaster's */
 	Assert(!IsUnderPostmaster);
@@ -274,6 +355,9 @@ PGSemaphoreCreate(void)
 	/* Remember new sema for ReleaseSemaphores */
 	mySemPointers[numSems] = newsem;
 	sema = (PGSemaphore) newsem;
+#elif defined(USE_FUTEX_SEMAPHORES)
+	sema = &sharedSemas[numSems];
+	pg_futex_sem_init(&sema->sem_padded.futexsem, 1);
 #else
 	sema = &sharedSemas[numSems];
 	newsem = PG_SEM_REF(sema);
@@ -293,6 +377,9 @@ PGSemaphoreCreate(void)
 void
 PGSemaphoreReset(PGSemaphore sema)
 {
+#ifdef USE_FUTEX_SEMAPHORES
+	pg_atomic_write_u32(&sema->sem_padded.futexsem, 0);
+#else
 	/*
 	 * There's no direct API for this in POSIX, so we have to ratchet the
 	 * semaphore down to 0 with repeated trywait's.
@@ -308,6 +395,7 @@ PGSemaphoreReset(PGSemaphore sema)
 			elog(FATAL, "sem_trywait failed: %m");
 		}
 	}
+#endif
 }
 
 /*
@@ -323,7 +411,11 @@ PGSemaphoreLock(PGSemaphore sema)
 	/* See notes in sysv_sema.c's implementation of PGSemaphoreLock. */
 	do
 	{
+#if defined(USE_FUTEX_SEMAPHORES)
+		errStatus = pg_futex_sem_wait(&sema->sem_padded.futexsem);
+#else
 		errStatus = sem_wait(PG_SEM_REF(sema));
+#endif
 	} while (errStatus < 0 && errno == EINTR);
 
 	if (errStatus < 0)
@@ -348,7 +440,11 @@ PGSemaphoreUnlock(PGSemaphore sema)
 	 */
 	do
 	{
+#if defined(USE_FUTEX_SEMAPHORES)
+		errStatus = pg_futex_sem_post(&sema->sem_padded.futexsem);
+#else
 		errStatus = sem_post(PG_SEM_REF(sema));
+#endif
 	} while (errStatus < 0 && errno == EINTR);
 
 	if (errStatus < 0)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 19cbf6e74ee..3fe34e91e1b 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -669,6 +669,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to select futex semaphores. */
+#undef USE_FUTEX_SEMAPHORES
+
 /* Define to build with ICU support. (--with-icu) */
 #undef USE_ICU
 
-- 
2.47.1

From aca7b842282f2f180ff497072079a9c40556f6fc Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 26 Oct 2023 18:43:26 +1300
Subject: [PATCH 3/3] Use futex-based semaphores on macOS.

---
 meson.build         |  2 ++
 src/template/darwin | 13 +------------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/meson.build b/meson.build
index 5c9775f1a6e..0e713f65a66 100644
--- a/meson.build
+++ b/meson.build
@@ -205,6 +205,8 @@ if host_system == 'cygwin'
   mod_link_with_dir = 'libdir'
 
 elif host_system == 'darwin'
+  sema_kind = 'futex'
+
   dlsuffix = '.dylib'
   library_path_var = 'DYLD_LIBRARY_PATH'
 
diff --git a/src/template/darwin b/src/template/darwin
index e8eb9390687..d3c78805401 100644
--- a/src/template/darwin
+++ b/src/template/darwin
@@ -14,17 +14,6 @@ fi
 # Extra CFLAGS for code that will go into a shared library
 CFLAGS_SL=""
 
-# Select appropriate semaphore support.  Darwin 6.0 (macOS 10.2) and up
-# support System V semaphores; before that we have to use named POSIX
-# semaphores, which are less good for our purposes because they eat a
-# file descriptor per backend per max_connection slot.
-case $host_os in
-  darwin[015].*)
-    USE_NAMED_POSIX_SEMAPHORES=1
-    ;;
-  *)
-    USE_SYSV_SEMAPHORES=1
-    ;;
-esac
+USE_FUTEX_SEMAPHORES=1
 
 DLSUFFIX=".dylib"
-- 
2.47.1

Reply via email to