Re: Race condition in InvalidateObsoleteReplicationSlots()

Álvaro Herrera Fri, 18 Jun 2021 13:59:24 -0700

On 2021-Jun-11, Álvaro Herrera wrote:

> I tried hard to make this stable, but it just isn't (it works fine one
> thousand runs, then I grab some coffee and run it once more and that one
> fails.  Why?  that's not clear to me).  Attached is the last one I have,
> in case somebody wants to make it better.  Maybe there's some completely
> different approach that works better, but I'm out of ideas for now.


It occurred to me that this could be made better by sigstopping both
walreceiver and walsender, then letting only the latter run; AFAICS this
makes the test stable.  I'll register this on the upcoming commitfest to
let cfbot run it, and if it looks good there I'll get it pushed to
master.  If there's any problem I'll just remove it before beta2 is
stamped.

-- 
Álvaro Herrera       Valdivia, Chile

>From 8080ced6b3c807039ff9a66810fa661fae19b347 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Thu, 10 Jun 2021 16:44:03 -0400
Subject: [PATCH v3] Add test case for obsoleting slot with active walsender
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code to signal a running walsender when its reserved WAL size grows
too large is completely uncovered before this commit; this adds coverage
for that case.

This test involves sending SIGSTOP to walsender and walreceiver and
running a checkpoint while advancing WAL, then sending SIGCONT.  There's
no precedent for this coding in Perl tests, and my reading of relevant
manpages says it's likely to fail on Windows.  Because of this, this
test is always skipped on that platform.

Author: Álvaro Herrera <alvhe...@alvh.no-ip.org>
Discussion: https://postgr.es/m/202106102202.mjw4huiix7lo@alvherre.pgsql
---
 src/test/recovery/t/019_replslot_limit.pl | 82 ++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 3 deletions(-)

diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index 7094aa0704..2e6c5a229e 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -11,7 +11,7 @@ use TestLib;
 use PostgresNode;
 
 use File::Path qw(rmtree);
-use Test::More tests => 14;
+use Test::More tests => $TestLib::windows_os ? 14 : 18;
 use Time::HiRes qw(usleep);
 
 $ENV{PGDATABASE} = 'postgres';
@@ -211,8 +211,8 @@ for (my $i = 0; $i < 10000; $i++)
 }
 ok($failed, 'check that replication has been broken');
 
-$node_primary->stop('immediate');
-$node_standby->stop('immediate');
+$node_primary->stop;
+$node_standby->stop;
 
 my $node_primary2 = get_new_node('primary2');
 $node_primary2->init(allows_streaming => 1);
@@ -253,6 +253,82 @@ my @result =
 		timeout => '60'));
 is($result[1], 'finished', 'check if checkpoint command is not blocked');
 
+$node_primary2->stop;
+$node_standby->stop;
+
+# The next test depends on Perl's `kill`, which apparently is not
+# portable to Windows.  (It would be nice to use Test::More's `subtest`,
+# but that's not in the ancient version we require.)
+if ($TestLib::windows_os)
+{
+	done_testing();
+	exit;
+}
+
+# Get a slot terminated while the walsender is active
+# We do this by sending SIGSTOP to the walsender.  Skip this on Windows.
+my $node_primary3 = get_new_node('primary3');
+$node_primary3->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
+$node_primary3->append_conf(
+	'postgresql.conf', qq(
+	min_wal_size = 2MB
+	max_wal_size = 2MB
+	log_checkpoints = yes
+	max_slot_wal_keep_size = 1MB
+	));
+$node_primary3->start;
+$node_primary3->safe_psql('postgres',
+	"SELECT pg_create_physical_replication_slot('rep3')");
+# Take backup
+$backup_name = 'my_backup';
+$node_primary3->backup($backup_name);
+# Create standby
+my $node_standby3 = get_new_node('standby_3');
+$node_standby3->init_from_backup($node_primary3, $backup_name,
+	has_streaming => 1);
+$node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
+$node_standby3->start;
+$node_primary3->wait_for_catchup($node_standby3->name, 'replay');
+my $senderpid = $node_primary3->safe_psql('postgres',
+	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
+like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+my $receiverpid = $node_standby3->safe_psql('postgres',
+	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
+like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
+
+# freeze walsender and walreceiver. Slot will still be active, but walreceiver
+# won't get anything anymore.
+kill 'STOP', $senderpid, $receiverpid;
+$logstart = get_log_size($node_primary3);
+advance_wal($node_primary3, 4);
+ok(find_in_log($node_primary3, "to release replication slot", $logstart),
+	"walreceiver termination logged");
+
+# Now let the walsender continue; slot should be killed now.
+# (Must not let walreceiver run yet; otherwise the standby could start another
+# one before the slot can be killed)
+kill 'CONT', $senderpid;
+$node_primary3->poll_query_until('postgres',
+	"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep3'",
+	"lost")
+  or die "timed out waiting for slot to be lost";
+
+while (1)
+{
+	if (find_in_log($node_primary3,
+			'invalidating slot "rep3" because its restart_lsn', $logstart))
+	{
+		ok(1, "slot invalidation logged");
+		last;
+	}
+}
+
+# Now let the walreceiver continue, so that the node can be stopped cleanly
+kill 'CONT', $receiverpid;
+
+$node_primary3->stop;
+$node_standby3->stop;
+
 #####################################
 # Advance WAL of $node by $n segments
 sub advance_wal
-- 
2.20.1

Re: Race condition in InvalidateObsoleteReplicationSlots()

Reply via email to