Hi,
I think I did find a bug related to the test, but afaics not the cause of the test failures we're seeing. See https://www.postgresql.org/message-id/20220223014855.4lsddr464i7mymk2%40alap3.anarazel.de I don't think it's related to the problem of this thread, because the logs of primary3 don't have a single mention of ereport(LOG, (errmsg("terminating process %d to release replication slot \"%s\"", active_pid, NameStr(slotname)))); On 2022-02-18 15:14:15 -0800, Andres Freund wrote: > I'm running out of ideas for how to try to reproduce this. I think we might > need some additional debugging information to get more information from the > buildfarm. > I'm thinking of adding log_min_messages=DEBUG2 to primary3, passing --verbose > to pg_basebackup in $node_primary3->backup(...). > > It might also be worth adding DEBUG2 messages to ReplicationSlotShmemExit(), > ReplicationSlotCleanup(), InvalidateObsoleteReplicationSlots(). Planning to commit something like the attached. Greetings, Andres Freund
>From afdeff10526e29e3fc63b18c08100458780489d9 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Tue, 22 Feb 2022 18:02:34 -0800 Subject: [PATCH v1] Add temporary debug info to help debug 019_replslot_limit.pl failures. I have not been able to reproduce the occasional failures of 019_replslot_limit.pl we are seeing in the buildfarm and not for lack of trying. The additional logging and increased log level will hopefully help. Will be reverted once the cause is identified. Discussion: https://postgr.es/m/20220218231415.c4plkp4i3reqc...@alap3.anarazel.de --- src/backend/replication/slot.c | 21 +++++++++++++++++++++ src/bin/pg_basebackup/pg_basebackup.c | 10 +++++++++- src/test/recovery/t/019_replslot_limit.pl | 5 ++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 5da5fa825a2..3d39fddaaef 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -177,6 +177,10 @@ ReplicationSlotInitialize(void) static void ReplicationSlotShmemExit(int code, Datum arg) { + /* temp debugging aid to analyze 019_replslot_limit failures */ + elog(DEBUG3, "replication slot exit hook, %s active slot", + MyReplicationSlot != NULL ? "with" : "without"); + /* Make sure active replication slots are released */ if (MyReplicationSlot != NULL) ReplicationSlotRelease(); @@ -554,6 +558,9 @@ ReplicationSlotCleanup(void) Assert(MyReplicationSlot == NULL); restart: + /* temp debugging aid to analyze 019_replslot_limit failures */ + elog(DEBUG3, "temporary replication slot cleanup: begin"); + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { @@ -579,6 +586,8 @@ restart: } LWLockRelease(ReplicationSlotControlLock); + + elog(DEBUG3, "temporary replication slot cleanup: done"); } /* @@ -1284,6 +1293,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN, (void) kill(active_pid, SIGTERM); last_signaled_pid = active_pid; } + else + { + /* temp debugging aid to analyze 019_replslot_limit failures */ + elog(DEBUG3, "not signalling process %d during invalidation of slot \"%s\"", + active_pid, NameStr(slotname)); + } /* Wait until the slot is released. */ ConditionVariableSleep(&s->active_cv, @@ -1347,6 +1362,10 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno) XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN); restart: + /* temp debugging aid to analyze 019_replslot_limit failures */ + elog(DEBUG3, "begin invalidating obsolete replication slots older than %X/%X", + LSN_FORMAT_ARGS(oldestLSN)); + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (int i = 0; i < max_replication_slots; i++) { @@ -1372,6 +1391,8 @@ restart: ReplicationSlotsComputeRequiredLSN(); } + elog(DEBUG3, "done invalidating obsolete replication slots"); + return invalidated; } diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 08b07d5a06e..8c77c533e64 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -700,8 +700,16 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier) bgchild = fork(); if (bgchild == 0) { + int ret; + /* in child process */ - exit(LogStreamerMain(param)); + ret = LogStreamerMain(param); + + /* temp debugging aid to analyze 019_replslot_limit failures */ + if (verbose) + pg_log_info("log streamer with pid %d exiting", getpid()); + + exit(ret); } else if (bgchild < 0) { diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index 4257bd4d35a..0c9da9bf272 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -316,13 +316,16 @@ $node_primary3->append_conf( max_wal_size = 2MB log_checkpoints = yes max_slot_wal_keep_size = 1MB + + # temp debugging aid to analyze 019_replslot_limit failures + log_min_messages=debug3 )); $node_primary3->start; $node_primary3->safe_psql('postgres', "SELECT pg_create_physical_replication_slot('rep3')"); # Take backup $backup_name = 'my_backup'; -$node_primary3->backup($backup_name); +$node_primary3->backup($backup_name, backup_options => ['--verbose']); # Create standby my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3'); $node_standby3->init_from_backup($node_primary3, $backup_name, -- 2.34.0