Alvaro Herrera wrote: > Robert Haas wrote: > > On Thu, Mar 29, 2018 at 5:51 PM, Andres Freund <and...@anarazel.de> wrote: > > >> > 2018-03-06 13:20:24.391 GMT [14869] ERROR: epoll_ctl() failed: Bad > > >> > file > > >> > descriptor > > >> > > >> I can confirm this bug exists in single-user mode. > > > > > > I'm not sure we need to do anything about this, personally. This seems > > > like a fairly rare thing to do in a mode that's definitely not intended > > > to be general purpose. > > > > Mmmph. I don't really think it's possible to view a user-visible > > EBADF as anything other than a coding error. > > IMO the problem is not the user-visible EBADF -- the problem is that the > user might be attempting to clean up from some previous mistake by > removing a replication slot. Using single-user mode might be a strange > tool, but it's not completely unreasonable. Is it really all that > difficult to fix slot acquisition for it?
Here's a patch (against pg10) to fix this problem. -- Álvaro Herrera https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From 59eb9253797776f21267d35b608582b5fd3ff67c Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Tue, 8 May 2018 11:48:56 -0300 Subject: [PATCH 1/2] don't try cond variables if single user mode --- src/backend/replication/slot.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index a8a16f55e9..a43f402214 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -351,20 +351,28 @@ retry: if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) { /* - * This is the slot we want. We don't know yet if it's active, so - * get ready to sleep on it in case it is. (We may end up not - * sleeping, but we don't want to do this while holding the - * spinlock.) + * This is the slot we want; check if it's active under some other + * process. In single user mode, we don't need this check. */ - ConditionVariablePrepareToSleep(&s->active_cv); + if (IsUnderPostmaster) + { + /* + * Get ready to sleep on it in case it is active. (We may end + * up not sleeping, but we don't want to do this while holding + * the spinlock.) + */ + ConditionVariablePrepareToSleep(&s->active_cv); - SpinLockAcquire(&s->mutex); + SpinLockAcquire(&s->mutex); - active_pid = s->active_pid; - if (active_pid == 0) - active_pid = s->active_pid = MyProcPid; + active_pid = s->active_pid; + if (active_pid == 0) + active_pid = s->active_pid = MyProcPid; - SpinLockRelease(&s->mutex); + SpinLockRelease(&s->mutex); + } + else + active_pid = MyProcPid; slot = s; break; -- 2.11.0
>From 008ec88995b2ae1d851033222bfbd8dfebd7a368 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Tue, 8 May 2018 11:49:08 -0300 Subject: [PATCH 2/2] simplify baroque coding --- src/backend/replication/slot.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index a43f402214..664bf2e3ad 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -99,7 +99,6 @@ ReplicationSlot *MyReplicationSlot = NULL; int max_replication_slots = 0; /* the maximum number of replication * slots */ -static void ReplicationSlotDropAcquired(void); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -434,7 +433,8 @@ ReplicationSlotRelease(void) * fail, all that may happen is an incomplete cleanup of the on-disk * data. */ - ReplicationSlotDropAcquired(); + ReplicationSlotDropPtr(MyReplicationSlot); + MyReplicationSlot = NULL; } /* @@ -520,23 +520,10 @@ ReplicationSlotDrop(const char *name, bool nowait) ReplicationSlotAcquire(name, nowait); - ReplicationSlotDropAcquired(); -} + ReplicationSlotDropPtr(MyReplicationSlot); -/* - * Permanently drop the currently acquired replication slot. - */ -static void -ReplicationSlotDropAcquired(void) -{ - ReplicationSlot *slot = MyReplicationSlot; - - Assert(MyReplicationSlot != NULL); - - /* slot isn't acquired anymore */ + /* no longer my slot */ MyReplicationSlot = NULL; - - ReplicationSlotDropPtr(slot); } /* @@ -923,7 +910,7 @@ restart: if (s->data.database != dboid) continue; - /* acquire slot, so ReplicationSlotDropAcquired can be reused */ + /* acquire slot */ SpinLockAcquire(&s->mutex); /* can't change while ReplicationSlotControlLock is held */ slotname = NameStr(s->data.name); @@ -949,16 +936,17 @@ restart: slotname, active_pid))); /* - * To avoid duplicating ReplicationSlotDropAcquired() and to avoid - * holding ReplicationSlotControlLock over filesystem operations, - * release ReplicationSlotControlLock and use - * ReplicationSlotDropAcquired. + * To avoid holding ReplicationSlotControlLock over filesystem + * operations, release ReplicationSlotControlLock while we drop + * the slot. * * As that means the set of slots could change, restart scan from the * beginning each time we release the lock. */ LWLockRelease(ReplicationSlotControlLock); - ReplicationSlotDropAcquired(); + ReplicationSlotDropPtr(MyReplicationSlot); + /* no longer my slot */ + MyReplicationSlot = NULL; goto restart; } LWLockRelease(ReplicationSlotControlLock); -- 2.11.0