On Sat, May 12, 2018 at 07:41:33AM +0900, Michael Paquier wrote:
> pg_ctl promote would wait for the control file to be updated, so you
> cannot use it in the TAP tests to trigger the promotion.  Still I think
> I found one after waking up?  Please note I have not tested it:
> - Use a custom trigger file and then trigger promotion with a signal.
> - Use a sleep command in recovery_end_command to increase the window, as
> what matters is sleeping after CreateEndOfRecoveryRecord updates the
> control file.
> - Issue a restart point on the standby, which will update the control
> file.
> - Stop the standby with immediate mode.
> - Start the standby, it should see unreferenced pages.

I have been looking at that this morning, and I have actually been
able to create a test case where minRecoveryPoint goes backwards with
Pavan's patch applied.  Using a sleep in recovery_end_command has
proved not to be enough, so I had to patch the backend with a couple of
sleeps and some extra processing, mainly:
- One sleep in CreateRestartPoint to make a restart point wait before
updating the control file, which I set at 5s.
- One sleep just after calling CreateEndOfRecoveryRecord, which has been
set at 20s.
- Trigger an asynchronous checkpoint on the standby using IPC::Run::start.
- Trigger a promotion with a trigger file and SIGUSR1 sent to the
postmaster (see the sketch below).
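
For reference, the asynchronous checkpoint and the promotion trigger
boil down to the following in the attached test (a simplified sketch,
reusing the $bravo node, $trigger_file and $bravo_pid variables set up
there; the error checks on open() are added here for readability):

# Kick a checkpoint on the standby asynchronously, so that it is still
# in progress when the promotion request comes in.
my $standby_checkpoint = IPC::Run::start(
	[ 'psql', '-d', $bravo->connstr('postgres'), '-c', 'checkpoint;' ]);

# Create the trigger file configured in recovery.conf and nudge the
# postmaster, instead of relying on pg_ctl promote which would wait
# for the control file to be updated.
open my $fh, '>', $trigger_file
  or die "could not create $trigger_file: $!";
close $fh;
TestLib::system_log('pg_ctl', 'kill', 'USR1', $bravo_pid);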

The rest of the test is crafted with some magic wait numbers and adds
some logs to ease monitoring of the issue.  In order to get a crash, I
think that you would need to crash the backend after creating the last
WAL records which generate the invalid page references, and also slow
down the last restart point which makes minRecoveryPoint go backwards,
which is, err...  complicated to make deterministic.  Still, if you
apply the attached patch you will see log entries on the standby as
follows:
2018-05-14 12:24:15.065 JST [17352] LOG:  selected new timeline ID: 2
2018-05-14 12:24:15.074 JST [17352] LOG:  archive recovery complete
2018-05-14 12:24:15.074 JST [17352] WARNING:  CreateEndOfRecoveryRecord: minRecoveryPoint is 0/32C4258 before update
2018-05-14 12:24:15.074 JST [17352] WARNING:  CreateEndOfRecoveryRecord: minRecoveryPoint is 0/0 after update
2018-05-14 12:24:17.067 JST [17353] WARNING:  checkPointCopy.redo = 0/30B3D70, lastCheckPoint.redo = 0/31BC208
2018-05-14 12:24:17.067 JST [17353] WARNING:  CreateRestartPoint: minRecoveryPoint is 0/0 before update, lastCheckPointEndPtr is 0/31BC2B0
2018-05-14 12:24:17.067 JST [17353] WARNING:  CreateRestartPoint: minRecoveryPoint is 0/31BC2B0 after update

So minRecoveryPoint can definitely go backwards here, which is not
good: the end-of-recovery record resets it to 0/0, and the restart
point still in flight then overwrites it with 0/31BC2B0, which is older
than the 0/32C4258 present before promotion.  Attached is a patch which
includes Pavan's fix on top of the test case I crafted from what has
been discussed upthread.  You just need to apply it on HEAD.
--
Michael
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c633e11128..a4fe01112b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2763,7 +2763,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 			minRecoveryPoint = newMinRecoveryPoint;
 			minRecoveryPointTLI = newMinRecoveryPointTLI;
 
-			ereport(DEBUG2,
+			ereport(WARNING,
 					(errmsg("updated min recovery point to %X/%X on timeline %u",
 							(uint32) (minRecoveryPoint >> 32),
 							(uint32) minRecoveryPoint,
@@ -7631,6 +7631,9 @@ StartupXLOG(void)
 				}
 			}
 
+			/* the sleep should happen before requesting a checkpoint */
+			pg_usleep(1000000L * 20); /* 20s */
+
 			if (!fast_promoted)
 				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 								  CHECKPOINT_IMMEDIATE |
@@ -7643,9 +7646,12 @@ StartupXLOG(void)
 		 * And finally, execute the recovery_end_command, if any.
 		 */
 		if (recoveryEndCommand)
+		{
+			elog(WARNING, "executing recovery_end_command");
 			ExecuteRecoveryCommand(recoveryEndCommand,
 								   "recovery_end_command",
 								   true);
+		}
 	}
 
 	if (ArchiveRecoveryRequested)
@@ -9027,14 +9033,31 @@ CreateEndOfRecoveryRecord(void)
 
 	XLogFlush(recptr);
 
+	/* Maybe an extra sleep is needed here: the WAL insertion seems to
+	 * be stuck because of the WAL insert lock taken by the restart
+	 * point... */
+
 	/*
 	 * Update the control file so that crash recovery can follow the timeline
 	 * changes to this point.
+	 *
+	 * Set minRecoveryPoint to InvalidXLogRecPtr so that a crash will force
+	 * redo recovery to the end of WAL. Otherwise a crash immediately after
+	 * promotion may lead to recovery to an inconsistent point and in the worst
+	 * case, recovery failing because of invalid page references (see
+	 * src/test/recovery/t/014_promotion_bug.pl).
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	elog(WARNING, "CreateEndOfRecoveryRecord: minRecoveryPoint is %X/%X before update",
+		 (uint32) (ControlFile->minRecoveryPoint >> 32),
+		 (uint32) ControlFile->minRecoveryPoint);
+
 	ControlFile->time = (pg_time_t) time(NULL);
-	ControlFile->minRecoveryPoint = recptr;
+	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
+	elog(WARNING, "CreateEndOfRecoveryRecord: minRecoveryPoint is %X/%X after update",
+		 (uint32) (ControlFile->minRecoveryPoint >> 32),
+		 (uint32) ControlFile->minRecoveryPoint);
 	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
 
@@ -9147,6 +9170,7 @@ CreateRestartPoint(int flags)
 	 */
 	if (!RecoveryInProgress())
 	{
+		elog(WARNING, "restart point skipped");
 		ereport(DEBUG2,
 				(errmsg("skipping restartpoint, recovery has already ended")));
 		LWLockRelease(CheckpointLock);
@@ -9170,6 +9194,7 @@ CreateRestartPoint(int flags)
 	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
 		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
 	{
+		elog(WARNING, "restart point skipped as same point..");
 		ereport(DEBUG2,
 				(errmsg("skipping restartpoint, already performed at %X/%X",
 						(uint32) (lastCheckPoint.redo >> 32),
@@ -9227,6 +9252,12 @@ CreateRestartPoint(int flags)
 	 */
 	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
+	/* 5s, needs to be lower than recovery_end_command! */
+	elog(WARNING, "waiting at sleep phase for CreateRestartPoint");
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
+		elog(WARNING, "control file uses DB_IN_ARCHIVE_RECOVERY");
+	pg_usleep(1000000L * 5);
+
 	/*
 	 * Update pg_control, using current time.  Check that it still shows
 	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
@@ -9234,6 +9265,11 @@ CreateRestartPoint(int flags)
 	 * we get here after the end-of-recovery checkpoint.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	elog(WARNING, "checkPointCopy.redo = %X/%X, lastCheckPoint.redo = %X/%X",
+		 (uint32) (ControlFile->checkPointCopy.redo >> 32),
+		 (uint32) ControlFile->checkPointCopy.redo,
+		 (uint32) (lastCheckPoint.redo >> 32),
+		 (uint32) lastCheckPoint.redo);
 	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
 		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
@@ -9241,6 +9277,12 @@ CreateRestartPoint(int flags)
 		ControlFile->checkPointCopy = lastCheckPoint;
 		ControlFile->time = (pg_time_t) time(NULL);
 
+		elog(WARNING, "CreateRestartPoint: minRecoveryPoint is %X/%X before update, "
+			 "lastCheckPointEndPtr is %X/%X",
+			 (uint32) (ControlFile->minRecoveryPoint >> 32),
+			 (uint32) ControlFile->minRecoveryPoint,
+			 (uint32) (lastCheckPointEndPtr >> 32),
+			 (uint32) lastCheckPointEndPtr);
 		/*
 		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
 		 * this will have happened already while writing out dirty buffers,
@@ -9261,6 +9303,9 @@ CreateRestartPoint(int flags)
 			minRecoveryPoint = ControlFile->minRecoveryPoint;
 			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		}
+		elog(WARNING, "CreateRestartPoint: minRecoveryPoint is %X/%X after update",
+			 (uint32) (ControlFile->minRecoveryPoint >> 32),
+			 (uint32) ControlFile->minRecoveryPoint);
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
 		UpdateControlFile();
diff --git a/src/test/recovery/t/014_promotion_bug.pl b/src/test/recovery/t/014_promotion_bug.pl
new file mode 100644
index 0000000000..85b275a40c
--- /dev/null
+++ b/src/test/recovery/t/014_promotion_bug.pl
@@ -0,0 +1,142 @@
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 2;
+
+# Initialize master node
+my $alpha = get_new_node('alpha');
+$alpha->init;
+$alpha->append_conf("postgresql.conf", <<EOF);
+max_wal_senders=5
+wal_level=replica
+min_wal_size=1GB
+max_wal_size=16GB
+hot_standby=on
+EOF
+
+$alpha->append_conf("pg_hba.conf", <<EOF);
+local	replication	all	trust
+EOF
+
+# Start the master
+$alpha->start;
+
+# setup/start a standby
+$alpha->backup('bkp');
+my $bravo = get_new_node('bravo');
+$bravo->init_from_backup($alpha, 'bkp', has_streaming => 1);
+$bravo->append_conf('postgresql.conf', <<EOF);
+checkpoint_timeout=1h
+checkpoint_completion_target=0.9
+EOF
+# Sleep at the end of recovery, before the control file is updated,
+# so that a checkpoint has time to come in and update the control
+# file once again...
+my $bravo_datadir = $bravo->data_dir;
+my $trigger_file = "$bravo_datadir/bravo_trigger_file";
+$bravo->append_conf('recovery.conf', <<EOF);
+#recovery_end_command = 'sleep 10'
+trigger_file = '$trigger_file'
+EOF
+$bravo->start;
+
+$alpha->safe_psql('postgres', 'create table test1 (a int)');
+$alpha->safe_psql('postgres', 'insert into test1 select generate_series(1, 10000)');
+
+# take a checkpoint
+$alpha->safe_psql('postgres', 'checkpoint');
+
+# the following vacuum will set visibility map bits and create problematic WAL
+# records
+$alpha->safe_psql('postgres', 'vacuum verbose test1');
+sleep 10;
+
+# now force a checkpoint on the standby. This seems unnecessary but for "some"
+# reason, the previous checkpoint on the master does not reflect on the standby
+# and without an explicit checkpoint, it may start redo recovery from a much
+# older point (which includes even create table and initial page additions).
+$bravo->safe_psql('postgres', 'checkpoint');
+
+# here the original runs pg_controldata
+
+# now just use a dummy table and run some operations to move minRecoveryPoint
+# beyond the previous vacuum
+$alpha->safe_psql('postgres', 'create table test2 (a int, b text)');
+$alpha->safe_psql('postgres', 'insert into test2 select generate_series(1,10000), md5(random()::text)');
+$alpha->safe_psql('postgres', 'truncate test2');
+# Wait here so that the last records can be replayed, allowing the next
+# restart point to happen.
+# This makes the next asynchronous restart point able to run and update
+# the control file.
+$alpha->safe_psql('postgres', 'checkpoint');
+# More data to move again minRecoveryPoint.
+$alpha->safe_psql('postgres', 'create table test3 (a int, b text)');
+$alpha->safe_psql('postgres', 'insert into test3 select generate_series(1,10000), md5(random()::text)');
+$alpha->safe_psql('postgres', 'truncate test3');
+sleep 2;
+
+# Generate a first restart point to make the second one go stale.
+# This needs to begin before promotion, so that this code can be stuck
+# within CreateRestartPoint.
+$bravo->safe_psql('postgres', 'select 1');
+my $standby_checkpoint = IPC::Run::start(
+	[
+	 'psql', '-d', $bravo->connstr('postgres'),
+	 '-c', 'checkpoint;'
+	]);
+sleep 1;
+
+# Send SIGUSR1 to the postmaster to trigger promotion.  We want this part
+# to be asynchronous, so first fetch the postmaster PID and create the
+# trigger file.
+my $pid_file = "$bravo_datadir/postmaster.pid";
+open my $file, '<', $pid_file or die "could not open $pid_file: $!";
+my $bravo_pid = <$file>;
+chomp($bravo_pid);
+close $file;
+# Create the trigger file.
+open $file, '>', $trigger_file or die "could not create $trigger_file: $!";
+close $file;
+my $ret = TestLib::system_log('pg_ctl', 'kill', 'USR1', $bravo_pid);
+is($ret, 0, "sent SIGUSR1 to bravo's postmaster's PID $bravo_pid");
+# Leave time to reach the wait after CreateEndOfRecoveryRecord before
+# the next restart point.
+sleep 2;
+
+# Give the end-of-recovery processing time to finish.  The sleep time
+# should match the previous sleep time in recovery_end_command, but
+# that's not mandatory either.
+sleep 25;
+
+# Reap the now-useless checkpoint process.  This is unfortunately too late,
+# as it has already finished...
+$standby_checkpoint->kill_kill;
+
+# Fun time: truncate the table on the promoted standby, vacuum it, and
+# extend it again.
+my $current_lsn =
+  $bravo->safe_psql('postgres', "SELECT pg_current_wal_lsn();");
+$bravo->safe_psql('postgres', "SELECT '$current_lsn'::pg_lsn;");
+$bravo->safe_psql('postgres', 'truncate test1');
+$bravo->safe_psql('postgres', 'vacuum verbose test1');
+$bravo->safe_psql('postgres', 'insert into test1 select generate_series(1,1000)');
+$current_lsn =
+  $bravo->safe_psql('postgres', "SELECT pg_current_wal_lsn();");
+$bravo->safe_psql('postgres', "SELECT '$current_lsn'::pg_lsn;");
+
+# Give the previous checkpoint enough time to finish...
+sleep 10;
+
+#
+# Now crash-stop the promoted standby and restart it.  If the timing is
+# correct, you should see a PANIC.
+$bravo->stop('immediate');
+$bravo->start;
+
+my $psql_out;
+$bravo->psql(
+   'postgres',
+   "SELECT count(*) FROM test1",
+   stdout => \$psql_out);
+is($psql_out, '1000', "Check that table state is correct");
