On Thu, Apr 28, 2022 at 03:49:42PM +0900, Michael Paquier wrote:
> I am not sure what you mean here.  FWIW, I am translating the
> suggestion of Nathan to split the existing check in
> CreateRestartPoint() that we are discussing here into two if blocks,
> instead of just one:
> - Move the update of checkPoint and checkPointCopy into its own if
> block, controlled only by the check on
> (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
> - Keep the code updating minRecoveryPoint and minRecoveryPointTLI
> mostly as-is, but just do the update when the control file state is
> DB_IN_ARCHIVE_RECOVERY.  Of course we need to keep the check on
> (minRecoveryPoint < lastCheckPointEndPtr).
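
For reference, a minimal sketch of that split, simplified from the
attached patch (field names as in xlog.c; the CHECKPOINT_IS_SHUTDOWN
handling and the LocalMinRecoveryPoint* updates are omitted here):

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* Advance the last checkpoint only if it has moved forward. */
	if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
	}

	/* Touch the minimum recovery point only while in archive recovery. */
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
	{
		ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
		ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
	}

	UpdateControlFile();
	LWLockRelease(ControlFileLock);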

And I have spent a bit of time on this stuff to finish with the
attached.  It would be a plus to get that done on HEAD for beta1, so
I'll try to deal with it on Monday.  I am still a bit worried about
the back branches, as concurrent checkpoints are possible there via
CreateCheckPoint() from the startup process (not the case on HEAD),
but the stable branches will have a new point release very soon, so
let's revisit this choice for them later.  The attached v6 includes a
TAP test, but I don't intend to commit it as it is expensive.
--
Michael
From ffc1469819b28ff76e3290cd695e60511d94305a Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Fri, 6 May 2022 19:57:54 +0900
Subject: [PATCH v6] Correctly update control file at promotion

If the cluster ends recovery on a promotion (aka the control file shows
a state different than DB_IN_ARCHIVE_RECOVERY) while
CreateRestartPoint() is still processing, this function could miss
updating the control file but still do the recycling/removal of the
past WAL segments, causing a follow-up restart doing recovery to fail
on a missing checkpoint record (aka PANIC) because the redo LSN
referred to in the control file was located in a segment already gone.

This commit fixes the control file update so that the redo LSN always
reflects what gets recycled, with the minimum recovery point changed
only if the cluster is still in archive recovery.
---
 src/backend/access/transam/xlog.c             |  48 ++++----
 .../t/027_invalid_checkpoint_after_promote.pl | 113 ++++++++++++++++++
 2 files changed, 141 insertions(+), 20 deletions(-)
 create mode 100644 src/test/recovery/t/027_invalid_checkpoint_after_promote.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 61cda56c6f..c9c0b3f90a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6921,6 +6921,9 @@ CreateRestartPoint(int flags)
 	XLogSegNo	_logSegNo;
 	TimestampTz xtime;
 
+	/* Concurrent checkpoint/restartpoint cannot happen */
+	Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
+
 	/* Get a local copy of the last safe checkpoint record. */
 	SpinLockAcquire(&XLogCtl->info_lck);
 	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
@@ -7013,30 +7016,33 @@ CreateRestartPoint(int flags)
 	 */
 	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
-	/*
-	 * Update pg_control, using current time.  Check that it still shows
-	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
-	 * this is a quick hack to make sure nothing really bad happens if somehow
-	 * we get here after the end-of-recovery checkpoint.
-	 */
+	/* Update pg_control, using current time */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
-		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
+
+	/*
+	 * Check if the control file still shows an older checkpoint, and update
+	 * its information.
+	 */
+	if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
 		ControlFile->checkPoint = lastCheckPointRecPtr;
 		ControlFile->checkPointCopy = lastCheckPoint;
+	}
 
-		/*
-		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
-		 * this will have happened already while writing out dirty buffers,
-		 * but not necessarily - e.g. because no buffers were dirtied.  We do
-		 * this because a backup performed in recovery uses minRecoveryPoint to
-		 * determine which WAL files must be included in the backup, and the
-		 * file (or files) containing the checkpoint record must be included,
-		 * at a minimum. Note that for an ordinary restart of recovery there's
-		 * no value in having the minimum recovery point any earlier than this
-		 * anyway, because redo will begin just after the checkpoint record.
-		 */
+	/*
+	 * Ensure minRecoveryPoint is past the checkpoint record while archive
+	 * recovery is still ongoing.  Normally, this will have happened already
+	 * while writing out dirty buffers, but not necessarily - e.g. because no
+	 * buffers were dirtied.  We do this because a base backup uses
+	 * minRecoveryPoint to determine which WAL files must be included in the
+	 * backup, and the file (or files) containing the checkpoint record must
+	 * be included, at a minimum.  Note that for an ordinary restart of
+	 * recovery there's no value in having the minimum recovery point any
+	 * earlier than this anyway, because redo will begin just after the
+	 * checkpoint record.
+	 */
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
+	{
 		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
 		{
 			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
@@ -7046,10 +7052,12 @@ CreateRestartPoint(int flags)
 			LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
 			LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		}
+
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
-		UpdateControlFile();
 	}
+
+	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
 
 	/*
diff --git a/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl
new file mode 100644
index 0000000000..15c8d0a01a
--- /dev/null
+++ b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl
@@ -0,0 +1,113 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Time::HiRes qw(usleep nanosleep);
+use Test::More tests => 5;
+
+# initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf(
+	'postgresql.conf', q[
+checkpoint_timeout = 30s
+max_wal_size = 16GB
+log_checkpoints = on
+restart_after_crash = on
+]);
+$node_primary->start;
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# set up a standby
+my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start;
+
+# dummy table for the upcoming tests.
+$node_primary->safe_psql('postgres', 'checkpoint');
+$node_primary->safe_psql('postgres', 'create table test (a int, b varchar(255))');
+
+# use a background psql to run a batch insert, just to make the checkpoint a little slow.
+my $psql_timeout = IPC::Run::timer(3600);
+my ($stdin, $stdout, $stderr) = ('', '', '');
+my $psql_primary = IPC::Run::start(
+	[ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_primary->connstr('postgres') ],
+	'<',
+	\$stdin,
+	'>',
+	\$stdout,
+	'2>',
+	\$stderr,
+	$psql_timeout);
+$stdin .= q[
+INSERT INTO test SELECT i, 'aaaaaaaaaaaaaaaaaaaaaaa' from generate_series(1, 100000000) as i;
+];
+$psql_primary->pump_nb();
+
+# wait until a restartpoint starts on the standby
+my $logstart = -s $node_standby->logfile;
+my $checkpoint_start = 0;
+for (my $i = 0; $i < 3000; $i++)
+{
+	my $log = slurp_file($node_standby->logfile, $logstart);
+	if ($log =~ m/restartpoint starting: time/)
+	{
+		$checkpoint_start = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_start, 1, 'restartpoint has started');
+
+# promote during restartpoint
+$node_primary->stop;
+$node_standby->promote;
+
+# wait until the restartpoint completes on the promoted node
+$logstart = -s $node_standby->logfile;
+$checkpoint_start = 0;
+for (my $i = 0; $i < 3000; $i++)
+{
+	my $log = slurp_file($node_standby->logfile, $logstart);
+	if ($log =~ m/restartpoint complete/)
+	{
+		$checkpoint_start = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_start, 1, 'restartpoint has completed');
+
+# SIGKILL a backend so that all backends restart; note that no new checkpoint has completed since the promotion.
+my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+my $killme = IPC::Run::start(
+	[ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_standby->connstr('postgres') ],
+	'<',
+	\$killme_stdin,
+	'>',
+	\$killme_stdout,
+	'2>',
+	\$killme_stderr,
+	$psql_timeout);
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
+my $pid = $killme_stdout;
+chomp($pid);
+my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
+is($ret, 0, 'killed process with KILL');
+
+# Without the fix, crash recovery fails with "PANIC: could not locate a valid checkpoint record" and the server never comes back up.
+for (my $i = 0; $i < 30; $i++)
+{
+	($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
+	last if $ret == 0;
+	sleep(1);
+}
+is($ret, 0, "psql connect success");
+is($stdout, 1, "psql select 1");
+
+$psql_primary->finish;
+$killme->finish;
-- 
2.36.0
