From e7c2e425eba642c8e9c379c5fecc4bd5caf28997 Mon Sep 17 00:00:00 2001
From: "Sami Imseih (AWS)" <simseih@amazon.com>
Date: Tue, 22 Feb 2022 19:09:36 +0000
Subject: [PATCH 1/1] Fix "missing continuation record" after standby promotion

Fix a condition where a recently promoted standby attempts to
write an OVERWRITE_CONTRECORD message at the LSN of the
previously read aborted record.
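
Without this check, a standby that has already replayed the old
primary's overwrite contrecord still carries the stale
missingContrecPtr, and on promotion it sets EndOfLog back to that LSN
and tries to write its own overwrite contrecord at a position it has
long since replayed past; this is the source of the "missing
continuation record" failure being fixed here.

A TAP test reproducing the scenario is included below as a nested patch
under src/test/recovery (the primary is stopped mid-record with its
next WAL segment removed, a standby is created from a cold backup, the
restarted primary's overwrite contrecord is replayed by the standby,
and the standby is then promoted).  After applying that inner patch as
well, one typical way to run just the new test, assuming a tree
configured with --enable-tap-tests, is:

    cd src/test/recovery
    make check PROVE_TESTS=t/029_overwrite_contrecord_promotion.pl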
---
 src/backend/access/transam/xlog.c             |  16 ++-
 ...inuation-record-after-standby-promot.patch | 134 ++++++++++++++++++
 2 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 src/test/recovery/0001-Fix-missing-continuation-record-after-standby-promot.patch

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a357..56c2fdec96 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5423,11 +5423,25 @@ StartupXLOG(void)
 	 * made it through and start writing after the portion that persisted.
 	 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
 	 * we'll do as soon as we're open for writing new WAL.)
+	 *
+	 * If the last record read during recovery is already at or past the
+	 * missing contrecord, this is a recently promoted standby and we
+	 * must not write an overwrite contrecord at that stale LSN.
 	 */
 	if (!XLogRecPtrIsInvalid(missingContrecPtr))
 	{
 		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
-		EndOfLog = missingContrecPtr;
+		if (endOfRecoveryInfo->lastRec < missingContrecPtr)
+		{
+			elog(DEBUG2, "setting end of WAL to missing continuation record %X/%X",
+						LSN_FORMAT_ARGS(missingContrecPtr));
+			EndOfLog = missingContrecPtr;
+		}
+		else
+		{
+			elog(DEBUG2, "resetting aborted record");
+			abortedRecPtr = InvalidXLogRecPtr;
+		}
 	}
 
 	/*
diff --git a/src/test/recovery/0001-Fix-missing-continuation-record-after-standby-promot.patch b/src/test/recovery/0001-Fix-missing-continuation-record-after-standby-promot.patch
new file mode 100644
index 0000000000..40d922801b
--- /dev/null
+++ b/src/test/recovery/0001-Fix-missing-continuation-record-after-standby-promot.patch
@@ -0,0 +1,134 @@
+From cb344355facb3e6f793013b0b9998683277f3bd8 Mon Sep 17 00:00:00 2001
+From: "Sami Imseih (AWS)" <simseih@amazon.com>
+Date: Tue, 22 Feb 2022 18:59:44 +0000
+Subject: [PATCH 1/1] Fix "missing continuation record" after standby
+ promotion.
+
+Fix a condition where a recently promoted standby attempts to
+write an OVERWRITE_CONTRECORD message at the LSN of the
+previously read aborted record.
+---
+ .../t/029_overwrite_contrecord_promotion.pl   | 111 ++++++++++++++++++
+ 1 file changed, 111 insertions(+)
+ create mode 100644 src/test/recovery/t/029_overwrite_contrecord_promotion.pl
+
+diff --git a/src/test/recovery/t/029_overwrite_contrecord_promotion.pl b/src/test/recovery/t/029_overwrite_contrecord_promotion.pl
+new file mode 100644
+index 0000000000..ea4ebb32c0
+--- /dev/null
++++ b/src/test/recovery/t/029_overwrite_contrecord_promotion.pl
+@@ -0,0 +1,111 @@
++# Copyright (c) 2021-2022, PostgreSQL Global Development Group
++
++# Tests for resetting the "aborted record" after a promotion.
++
++use strict;
++use warnings;
++
++use FindBin;
++use PostgreSQL::Test::Cluster;
++use PostgreSQL::Test::Utils;
++use Test::More;
++
++# Test: Create a physical replica that's missing the last WAL file,
++# then restart the primary to create a divergent WAL file and observe
++# that the replica resets the "aborted record" after a promotion.
++
++my $node = PostgreSQL::Test::Cluster->new('primary');
++$node->init(allows_streaming => 1);
++# We need these settings for stability of WAL behavior; DEBUG2 lets us check the server log later.
++$node->append_conf(
++	'postgresql.conf', qq(
++autovacuum = off
++wal_keep_size = 1GB
++log_min_messages = DEBUG2
++));
++$node->start;
++
++$node->safe_psql('postgres', 'create table filler (a int, b text)');
++
++# Now consume all remaining room in the current WAL segment, leaving
++# space enough only for the start of a largish record.
++$node->safe_psql(
++	'postgres', q{
++DO $$
++DECLARE
++    wal_segsize int := setting::int FROM pg_settings WHERE name = 'wal_segment_size';
++    remain int;
++    iters  int := 0;
++BEGIN
++    LOOP
++        INSERT into filler
++        select g, repeat(md5(g::text), (random() * 60 + 1)::int)
++        from generate_series(1, 10) g;
++
++        remain := wal_segsize - (pg_current_wal_insert_lsn() - '0/0') % wal_segsize;
++        IF remain < 2 * setting::int from pg_settings where name = 'block_size' THEN
++            RAISE log 'exiting after % iterations, % bytes to end of WAL segment', iters, remain;
++            EXIT;
++        END IF;
++        iters := iters + 1;
++    END LOOP;
++END
++$$;
++});
++
++my $initfile = $node->safe_psql('postgres',
++	'SELECT pg_walfile_name(pg_current_wal_insert_lsn())');
++$node->safe_psql('postgres',
++	qq{SELECT pg_logical_emit_message(true, 'test 029', repeat('xyzxz', 123456))}
++);
++#$node->safe_psql('postgres', qq{create table foo ()});
++my $endfile = $node->safe_psql('postgres',
++	'SELECT pg_walfile_name(pg_current_wal_insert_lsn())');
++ok($initfile ne $endfile, "$initfile differs from $endfile");
++
++# Now stop abruptly, to avoid a stop checkpoint.  We can remove the tail file
++# afterwards, and on startup the large message should be overwritten with new
++# contents
++$node->stop('immediate');
++
++unlink $node->basedir . "/pgdata/pg_wal/$endfile"
++  or die "could not unlink " . $node->basedir . "/pgdata/pg_wal/$endfile: $!";
++
++# OK, create a standby at this spot.
++$node->backup_fs_cold('backup');
++my $node_standby = PostgreSQL::Test::Cluster->new('standby');
++$node_standby->init_from_backup($node, 'backup', has_streaming => 1);
++
++$node_standby->start;
++$node->start;
++
++$node->safe_psql('postgres',
++	qq{create table foo (a text); insert into foo values ('hello')});
++$node->safe_psql('postgres',
++	qq{SELECT pg_logical_emit_message(true, 'test 029', 'AABBCC')});
++
++my $until_lsn = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
++my $caughtup_query =
++  "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
++$node_standby->poll_query_until('postgres', $caughtup_query)
++  or die "Timed out while waiting for standby to catch up";
++
++is($node_standby->safe_psql('postgres', 'select * from foo'),
++	'hello', 'standby replays past overwritten contrecord');
++
++$ENV{PGDATA} = $node_standby->data_dir;
++$ENV{PGPORT} = $node_standby->port;
++$ENV{PGHOST} = $node_standby->host;
++system "psql -c 'select pg_promote()'";
++
++# Verify message appears in standby's log
++my $log = slurp_file($node_standby->logfile);
++like(
++	$log,
++	qr[resetting aborted record],
++	"found log line in standby");
++
++$node->stop;
++$node_standby->stop;
++
++done_testing();
+-- 
+2.32.0
+
-- 
2.32.0

