Hi,

I think I did find a bug related to the test, but afaics not the cause of the
test failures we're seeing. See
https://www.postgresql.org/message-id/20220223014855.4lsddr464i7mymk2%40alap3.anarazel.de

I don't think it's related to the problem of this thread, because the logs of
primary3 don't have a single mention of

                ereport(LOG,
                        (errmsg("terminating process %d to release replication 
slot \"%s\"",
                                active_pid, NameStr(slotname))));

On 2022-02-18 15:14:15 -0800, Andres Freund wrote:
> I'm running out of ideas for how to try to reproduce this. I think we might
> need some additional debugging information to get more information from the
> buildfarm.

> I'm thinking of adding log_min_messages=DEBUG2 to primary3, passing --verbose
> to pg_basebackup in $node_primary3->backup(...).
>
> It might also be worth adding DEBUG2 messages to ReplicationSlotShmemExit(),
> ReplicationSlotCleanup(), InvalidateObsoleteReplicationSlots().

Planning to commit something like the attached.

Greetings,

Andres Freund
>From afdeff10526e29e3fc63b18c08100458780489d9 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 22 Feb 2022 18:02:34 -0800
Subject: [PATCH v1] Add temporary debug info to help debug
 019_replslot_limit.pl failures.

I have not been able to reproduce the occasional failures of
019_replslot_limit.pl we are seeing in the buildfarm and not for lack of
trying. The additional logging and increased log level will hopefully help.

Will be reverted once the cause is identified.

Discussion: https://postgr.es/m/20220218231415.c4plkp4i3reqc...@alap3.anarazel.de
---
 src/backend/replication/slot.c            | 21 +++++++++++++++++++++
 src/bin/pg_basebackup/pg_basebackup.c     | 10 +++++++++-
 src/test/recovery/t/019_replslot_limit.pl |  5 ++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 5da5fa825a2..3d39fddaaef 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -177,6 +177,10 @@ ReplicationSlotInitialize(void)
 static void
 ReplicationSlotShmemExit(int code, Datum arg)
 {
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "replication slot exit hook, %s active slot",
+		 MyReplicationSlot != NULL ? "with" : "without");
+
 	/* Make sure active replication slots are released */
 	if (MyReplicationSlot != NULL)
 		ReplicationSlotRelease();
@@ -554,6 +558,9 @@ ReplicationSlotCleanup(void)
 	Assert(MyReplicationSlot == NULL);
 
 restart:
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "temporary replication slot cleanup: begin");
+
 	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 	for (i = 0; i < max_replication_slots; i++)
 	{
@@ -579,6 +586,8 @@ restart:
 	}
 
 	LWLockRelease(ReplicationSlotControlLock);
+
+	elog(DEBUG3, "temporary replication slot cleanup: done");
 }
 
 /*
@@ -1284,6 +1293,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
 				(void) kill(active_pid, SIGTERM);
 				last_signaled_pid = active_pid;
 			}
+			else
+			{
+				/* temp debugging aid to analyze 019_replslot_limit failures */
+				elog(DEBUG3, "not signalling process %d during invalidation of slot \"%s\"",
+					 active_pid, NameStr(slotname));
+			}
 
 			/* Wait until the slot is released. */
 			ConditionVariableSleep(&s->active_cv,
@@ -1347,6 +1362,10 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
 	XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
 
 restart:
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "begin invalidating obsolete replication slots older than %X/%X",
+		 LSN_FORMAT_ARGS(oldestLSN));
+
 	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 	for (int i = 0; i < max_replication_slots; i++)
 	{
@@ -1372,6 +1391,8 @@ restart:
 		ReplicationSlotsComputeRequiredLSN();
 	}
 
+	elog(DEBUG3, "done invalidating obsolete replication slots");
+
 	return invalidated;
 }
 
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 08b07d5a06e..8c77c533e64 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -700,8 +700,16 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier)
 	bgchild = fork();
 	if (bgchild == 0)
 	{
+		int			ret;
+
 		/* in child process */
-		exit(LogStreamerMain(param));
+		ret = LogStreamerMain(param);
+
+		/* temp debugging aid to analyze 019_replslot_limit failures */
+		if (verbose)
+			pg_log_info("log streamer with pid %d exiting", getpid());
+
+		exit(ret);
 	}
 	else if (bgchild < 0)
 	{
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index 4257bd4d35a..0c9da9bf272 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -316,13 +316,16 @@ $node_primary3->append_conf(
 	max_wal_size = 2MB
 	log_checkpoints = yes
 	max_slot_wal_keep_size = 1MB
+
+	# temp debugging aid to analyze 019_replslot_limit failures
+	log_min_messages=debug3
 	));
 $node_primary3->start;
 $node_primary3->safe_psql('postgres',
 	"SELECT pg_create_physical_replication_slot('rep3')");
 # Take backup
 $backup_name = 'my_backup';
-$node_primary3->backup($backup_name);
+$node_primary3->backup($backup_name, backup_options => ['--verbose']);
 # Create standby
 my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3');
 $node_standby3->init_from_backup($node_primary3, $backup_name,
-- 
2.34.0

Reply via email to