On Mon, May 16, 2022 at 3:45 PM Japin Li <japi...@hotmail.com> wrote: > Maybe use the __illumos__ macro more accurity. > > +#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \ > + !defined(__sun__)
Thanks, updated, and with a new commit message. I don't know much about these OSes (though I used lots of Sun machines during the Jurassic period). I know that there are three distributions of illumos: OmniOS, SmartOS and OpenIndiana, and they share the same kernel and base system. The off-list reports I received about hangs and kernel panics were from OpenIndiana animals hake and haddock, which are not currently reporting (I'll ask why), and then their owner defined -DWAIT_USE_POLL to clear that up while we waited for progress on his kernel panic bug report. I see that OmniOS animal pollock is currently reporting and also uses -DWAIT_USE_POLL, but I couldn't find any discussion about that. Of course, you might be hitting some completely different problem, given the lack of information. I'd be interested in the output of "p *MyLatch" (= to see if the latch has already been set), and whether "kill -URG PID" dislodges the stuck process. But given the open kernel bug report that I've now been reminded of, I'm thinking about pushing this anyway. Then we could ask the animal owners to remove -DWAIT_USE_POLL so that they'd effectively be running with -DWAIT_USE_EPOLL and -DWAIT_USE_SELF_PIPE, which would be more like PostgreSQL 13, but people who want to reproduce the problem on the illumos side could build with -DWAIT_USE_SIGNALFD.
From 9a2dd1ed57c3364c98fe459071e84702015c5814 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 14 May 2022 10:15:30 +1200 Subject: [PATCH v3] Don't trust signalfd() on illumos. Hangs and kernel panics have been reported on illumos systems since we started using signalfd() to receive latch wakeup events. A bug report exists at https://www.illumos.org/issues/13700 but no fix is available yet. Provide a way to go back to using a self-pipe with -DWAIT_USE_SELF_PIPE, and make that the default on that platform. Users can explicitly provide -DWAIT_USE_SIGNALFD if required in case that's helpful to investigate what's going wrong. Back-patch to 14, where we started using signalfd(). Reported-by: Japin Li <japi...@hotmail.com> Reported-by: Olaf Bohlen <olboh...@eenfach.de> (off-list) Reviewed-by: Japin Li <japi...@hotmail.com> Discussion: https://postgr.es/m/MEYP282MB1669C8D88F0997354C2313C1B6CA9%40MEYP282MB1669.AUSP282.PROD.OUTLOOK.COM --- src/backend/storage/ipc/latch.c | 58 +++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 78c6a89271..fba8a9ea94 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -72,7 +72,7 @@ #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \ defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32) /* don't overwrite manual choice */ -#elif defined(HAVE_SYS_EPOLL_H) && defined(HAVE_SYS_SIGNALFD_H) +#elif defined(HAVE_SYS_EPOLL_H) #define WAIT_USE_EPOLL #elif defined(HAVE_KQUEUE) #define WAIT_USE_KQUEUE @@ -84,6 +84,22 @@ #error "no wait set implementation available" #endif +/* + * By default, we use a self-pipe with poll() and a signalfd with epoll(), if + * available. We avoid signalfd on illumos for now based on problem reports. + * For testing the choice can also be manually specified. + */ +#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +/* don't overwrite manual choice */ +#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \ + !defined(__illumos__) +#define WAIT_USE_SIGNALFD +#else +#define WAIT_USE_SELF_PIPE +#endif +#endif + /* typedef in latch.h */ struct WaitEventSet { @@ -146,12 +162,12 @@ static WaitEventSet *LatchWaitSet; static volatile sig_atomic_t waiting = false; #endif -#ifdef WAIT_USE_EPOLL +#ifdef WAIT_USE_SIGNALFD /* On Linux, we'll receive SIGURG via a signalfd file descriptor. */ static int signal_fd = -1; #endif -#if defined(WAIT_USE_POLL) +#ifdef WAIT_USE_SELF_PIPE /* Read and write ends of the self-pipe */ static int selfpipe_readfd = -1; static int selfpipe_writefd = -1; @@ -164,7 +180,7 @@ static void latch_sigurg_handler(SIGNAL_ARGS); static void sendSelfPipeByte(void); #endif -#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) static void drain(void); #endif @@ -190,7 +206,7 @@ static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, void InitializeLatchSupport(void) { -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) int pipefd[2]; if (IsUnderPostmaster) @@ -264,7 +280,7 @@ InitializeLatchSupport(void) pqsignal(SIGURG, latch_sigurg_handler); #endif -#ifdef WAIT_USE_EPOLL +#ifdef WAIT_USE_SIGNALFD sigset_t signalfd_mask; /* Block SIGURG, because we'll receive it through a signalfd. */ @@ -316,7 +332,7 @@ ShutdownLatchSupport(void) LatchWaitSet = NULL; } -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) close(selfpipe_readfd); close(selfpipe_writefd); selfpipe_readfd = -1; @@ -324,7 +340,7 @@ ShutdownLatchSupport(void) selfpipe_owner_pid = InvalidPid; #endif -#if defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SIGNALFD) close(signal_fd); signal_fd = -1; #endif @@ -341,9 +357,12 @@ InitLatch(Latch *latch) latch->owner_pid = MyProcPid; latch->is_shared = false; -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) /* Assert InitializeLatchSupport has been called in this process */ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); #elif defined(WAIT_USE_WIN32) latch->event = CreateEvent(NULL, TRUE, FALSE, NULL); if (latch->event == NULL) @@ -405,9 +424,12 @@ OwnLatch(Latch *latch) /* Sanity checks */ Assert(latch->is_shared); -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) /* Assert InitializeLatchSupport has been called in this process */ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); #endif if (latch->owner_pid != 0) @@ -617,7 +639,7 @@ SetLatch(Latch *latch) return; else if (owner_pid == MyProcPid) { -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) if (waiting) sendSelfPipeByte(); #else @@ -904,9 +926,9 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, { set->latch = latch; set->latch_pos = event->pos; -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) event->fd = selfpipe_readfd; -#elif defined(WAIT_USE_EPOLL) +#elif defined(WAIT_USE_SIGNALFD) event->fd = signal_fd; #else event->fd = PGINVALID_SOCKET; @@ -2083,7 +2105,7 @@ GetNumRegisteredWaitEvents(WaitEventSet *set) return set->nevents; } -#if defined(WAIT_USE_POLL) +#if defined(WAIT_USE_SELF_PIPE) /* * SetLatch uses SIGURG to wake up the process waiting on the latch. @@ -2134,7 +2156,7 @@ retry: #endif -#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) /* * Read all available data from self-pipe or signalfd. @@ -2150,7 +2172,7 @@ drain(void) int rc; int fd; -#ifdef WAIT_USE_POLL +#ifdef WAIT_USE_SELF_PIPE fd = selfpipe_readfd; #else fd = signal_fd; @@ -2168,7 +2190,7 @@ drain(void) else { waiting = false; -#ifdef WAIT_USE_POLL +#ifdef WAIT_USE_SELF_PIPE elog(ERROR, "read() on self-pipe failed: %m"); #else elog(ERROR, "read() on signalfd failed: %m"); @@ -2178,7 +2200,7 @@ drain(void) else if (rc == 0) { waiting = false; -#ifdef WAIT_USE_POLL +#ifdef WAIT_USE_SELF_PIPE elog(ERROR, "unexpected EOF on self-pipe"); #else elog(ERROR, "unexpected EOF on signalfd"); -- 2.36.0