Hi,

On Fri, Aug 9, 2024 at 4:29 PM Dmitry Dolgov <9erthali...@gmail.com> wrote:
> Seems like the retry loop from 019_replslot_limit might help.

Thanks for the tip. Attached v2 adds the retry loop to the test, which should hopefully fix the cfbot failure.

Kind Regards,
Stefan
From 203747394503b479338980948ccb3df84dc8d1a1 Mon Sep 17 00:00:00 2001 From: Stefan Fercot <pgs...@fercot.be> Date: Fri, 5 Apr 2024 10:57:03 +0200 Subject: [PATCH v2.] Make XLogFileRead try to restore .partial wal archives Try to restore the normal archived wal segment first and, if not found, then try to restore the archived .partial wal segment. This is safe because the next completed wal segment should contain at least the same data. --- src/backend/access/transam/xlogrecovery.c | 20 +++++++++- src/test/recovery/t/028_pitr_timelines.pl | 47 ++++++++++++++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 2ed3ea2b45..c8e793e3d9 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4196,6 +4196,8 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, char activitymsg[MAXFNAMELEN + 16]; char path[MAXPGPATH]; int fd; + char *partialxlogfname; + bool restoredArchivedFile; XLogFileName(xlogfname, tli, segno, wal_segment_size); @@ -4207,10 +4209,24 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, xlogfname); set_ps_display(activitymsg); - if (!RestoreArchivedFile(path, xlogfname, + /* + * Try to restore the normal wal segment first and, if not found, + * then try to restore the .partial wal segment. 
+ */ + + partialxlogfname = psprintf("%s.partial", xlogfname); + + restoredArchivedFile = !RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo) && + !RestoreArchivedFile(path, partialxlogfname, "RECOVERYXLOG", wal_segment_size, - InRedo)) + InRedo); + + pfree(partialxlogfname); + if (restoredArchivedFile) return -1; break; diff --git a/src/test/recovery/t/028_pitr_timelines.pl b/src/test/recovery/t/028_pitr_timelines.pl index 4b7d825b71..a958a0c6e7 100644 --- a/src/test/recovery/t/028_pitr_timelines.pl +++ b/src/test/recovery/t/028_pitr_timelines.pl @@ -110,7 +110,7 @@ $node_standby->stop; # segment 000000020000000000000003, before the timeline switching # record. (They are also present in the # 000000010000000000000003.partial file, but .partial files are not -# used automatically.) +# used when recovering along the latest timeline by default.) # Now test PITR to the recovery target. It should find the WAL in # segment 000000020000000000000003, but not follow the timeline switch @@ -173,4 +173,49 @@ $node_pitr2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") $result = $node_pitr2->safe_psql('postgres', "SELECT max(i) FROM foo;"); is($result, qq{3}, "check table contents after point-in-time recovery"); +# The 000000010000000000000003.partial file could have been generated +# by pg_receivewal without any standby node involved. In this case, we +# wouldn't be able to recover from 000000020000000000000003. +# Now, test PITR to the initial recovery target staying on the backup's +# current timeline, trying to fetch the data from the +# 000000010000000000000003.partial file. 
+ +my $node_pitr3 = PostgreSQL::Test::Cluster->new('node_pitr3'); +$node_pitr3->init_from_backup( + $node_primary, $backup_name, + standby => 0, + has_restoring => 1); +$node_pitr3->append_conf( + 'postgresql.conf', qq{ +recovery_target_name = 'rp' +recovery_target_action = 'promote' +recovery_target_timeline = 'current' +}); + +my $log_offset = -s $node_pitr3->logfile; +$node_pitr3->start; + +my $msg_logged = 0; +my $max_attempts = $PostgreSQL::Test::Utils::timeout_default; +while ($max_attempts-- >= 0) +{ + if ($node_pitr3->log_contains( + "restored log file \"000000010000000000000003.partial\" from archive", + $log_offset)) + { + $msg_logged = 1; + last; + } + sleep 1; +} +ok($msg_logged, "restored 000000010000000000000003.partial"); + +# Wait until recovery finishes. +$node_pitr3->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; + +# Check that we see the data we expect. +$result = $node_pitr3->safe_psql('postgres', "SELECT max(i) FROM foo;"); +is($result, qq{1}, "check table contents after point-in-time recovery"); + done_testing(); -- 2.34.1