From 9930c1856d1e8e2f027276d6599c1eff7866b9db Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Fri, 20 Feb 2026 00:05:03 +0500
Subject: [PATCH] Fix archive recovery falling back to wrong-timeline WAL
 segment

XLogFileReadAnyTLI iterates expectedTLEs newest-first and, when the
correct timeline's segment is absent from the archive, falls back to an
older timeline's segment for the same position.  Past a switch point
that older segment carries divergent WAL, so recovery silently applies
wrong data.

The correct invariant: for any segment, the owner is the newest
timeline in expectedTLEs whose begin_seg <= segno.  If that segment is
absent, recovery must stop rather than fall back.

Implement this with a found_eligible flag rather than a targetBeginSeg
pre-check (which would only guard the final switch point): once the
newest eligible timeline has been tried and its segment is not found,
break out of the loop instead of continuing to older timelines.

Add two TAP tests: 052 covers the basic two-timeline case; 053 covers
a three-timeline chain where the intermediate switch-point segment is
absent, which the old targetBeginSeg approach did not catch.
---
 src/backend/access/transam/xlogrecovery.c     |  14 ++
 .../052_timeline_switch_archive_divergence.pl |  92 ++++++++++++
 .../t/053_timeline_switch_intermediate_tl.pl  | 142 ++++++++++++++++++
 3 files changed, 248 insertions(+)
 create mode 100644 src/test/recovery/t/052_timeline_switch_archive_divergence.pl
 create mode 100644 src/test/recovery/t/053_timeline_switch_intermediate_tl.pl

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index c0c2744d45b..b4c4e87e0fb 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -4345,6 +4345,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
 	ListCell   *cell;
 	int			fd;
 	List	   *tles;
+	bool		found_eligible;
 
 	/*
 	 * Loop looking for a suitable timeline ID: we might need to read any of
@@ -4369,6 +4370,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
 	else
 		tles = readTimeLineHistory(recoveryTargetTLI);
 
+	found_eligible = false;
 	foreach(cell, tles)
 	{
 		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
@@ -4401,6 +4403,18 @@ XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
 				continue;
 		}
 
+		/*
+		 * Only the newest timeline that passes the beginseg check may supply
+		 * this segment.  Older timelines can pass that check too, but past
+		 * the point where a child timeline branched off, their WAL diverges
+		 * and is not valid for the child's recovery path.  So once one
+		 * eligible timeline has been tried and its segment was not found,
+		 * stop here instead of silently falling back to a parent's segment.
+		 */
+		if (found_eligible)
+			break;
+		found_eligible = true;
+
 		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
 		{
 			fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
diff --git a/src/test/recovery/t/052_timeline_switch_archive_divergence.pl b/src/test/recovery/t/052_timeline_switch_archive_divergence.pl
new file mode 100644
index 00000000000..aef9569a046
--- /dev/null
+++ b/src/test/recovery/t/052_timeline_switch_archive_divergence.pl
@@ -0,0 +1,92 @@
+
+# Copyright (c) 2021-2026, PostgreSQL Global Development Group
+
+# Test that archive recovery targeting a child timeline (TL2) does not use
+# the parent timeline's WAL segment for the segment containing the switch
+# point when the child timeline's segment is absent from the archive.
+#
+# Setup: TL1 archives its segments up to and including the one containing the
+# switch point (called "segment 3" below; the real name is captured at run
+# time).  Only the TL2 history file is added to the archive; TL2 segment 3 is
+# not.  Recovery runs with recovery_target_timeline = '2'.  Without the fix it
+# restores TL1 segment 3; with the fix it skips TL1 and waits for TL2.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::Copy;
+
+# Initialize primary with WAL archiving.
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+$node_primary->start;
+
+$node_primary->safe_psql('postgres', 'CREATE TABLE t (i int)');
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create a streaming standby so we can promote it.  Disable archiving so
+# it does not inherit the primary's archive_command and archive TL2 segments
+# into the primary's archive (which would mask the bug by making TL2 seg 3
+# available when it should not be).
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->append_conf('postgresql.conf', "archive_mode = off");
+$node_standby->start;
+$node_primary->wait_for_catchup($node_standby);
+
+# Force a segment boundary: switch to segment 3, so the switch point will
+# land inside segment 3 (both TL1 and TL2 will have a segment 3).
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (1)');
+$node_primary->wait_for_catchup($node_standby);
+
+# Promote standby to TL2.  The timeline history file (00000002.history) is
+# written to pg_wal immediately upon promotion.
+$node_standby->promote;
+$node_standby->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()')
+  or die "Timed out waiting for promotion";
+
+# Old primary writes to segment 3 and archives it.  This segment overlaps
+# the switch point but is on TL1 -- recovery must NOT use it for TL2.
+$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-1)');
+my $old_walfile = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_primary->poll_query_until('postgres',
+	"SELECT last_archived_wal >= '$old_walfile' FROM pg_stat_archiver")
+  or die "Timed out waiting for old primary to archive";
+
+# Add TL2 timeline history to the archive by copying it directly from the
+# standby's pg_wal.  This tells recovery that TL2 exists, but TL2 segment 3
+# is intentionally absent so recovery must not fall back to TL1 segment 3.
+my $archive = $node_primary->archive_dir;
+copy($node_standby->data_dir . '/pg_wal/00000002.history',
+	"$archive/00000002.history")
+  or die "Could not copy 00000002.history: $!";
+
+# Stop both nodes.  Recovery will run archive-only, no streaming source.
+$node_primary->stop;
+$node_standby->stop;
+
+# Create a recovery node using old primary's archive only (no streaming).
+my $node_rec = PostgreSQL::Test::Cluster->new('recovering');
+$node_rec->init_from_backup($node_primary, $backup_name, has_restoring => 1);
+$node_rec->enable_restoring($node_primary, 1);
+$node_rec->append_conf('postgresql.conf', "recovery_target_timeline = '2'");
+
+$node_rec->start;
+
+# Wait for the node to accept connections while it is still in recovery.
+$node_rec->poll_query_until('postgres', 'SELECT pg_is_in_recovery()', 't')
+  or die "Node is not in recovery";
+
+# With the fix, recovery never restores TL1's copy of the switch-point segment
+# and waits for TL2's.  NOTE(review): nothing here waits for that attempt.
+my $log_content = slurp_file($node_rec->logfile);
+unlike($log_content, qr/restored log file "\Q$old_walfile\E"/,
+	"archive recovery did not use TL1 segment at the switch point ($old_walfile)");
+
+done_testing();
diff --git a/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl b/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl
new file mode 100644
index 00000000000..9801ebda2c4
--- /dev/null
+++ b/src/test/recovery/t/053_timeline_switch_intermediate_tl.pl
@@ -0,0 +1,142 @@
+
+# Copyright (c) 2021-2026, PostgreSQL Global Development Group
+
+# Test that archive recovery with three timelines does not fall back to a
+# grandparent TL segment when the intermediate TL segment at its own switch
+# point is absent from the archive.
+#
+# Topology: TL1 -> TL2 (switch in seg N+1) -> TL3 (switch in seg N+2).
+# The base backup is taken while the primary is still in segment N.
+#
+# Archive contains TL1 segments N, N+1, N+2 and the TL3 history file, but
+# NOT TL2 segment N+1 (the segment containing the TL1->TL2 switch point).
+#
+# Recovery target is TL3.  For segment N+1, TL2 is the first eligible timeline
+# (its begin_seg == N+1).  TL2 segment N+1 is absent, so a correct
+# implementation must stop there and not fall back to TL1 segment N+1, which
+# carries divergent WAL from the old primary after the switch.
+#
+# A fix that only guards the final timeline's switch point (checking
+# targetBeginSeg = TL3.begin_seg = N+2) would still allow TL1 segment N+1
+# to be used because segno N+1 < targetBeginSeg N+2.  The correct fix must
+# stop at the first eligible timeline for each segment: once TL2 is identified
+# as eligible for segment N+1 but not found, recovery must not try TL1.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::Copy;
+
+# Primary (TL1) with WAL archiving.
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+$node_primary->start;
+$node_primary->safe_psql('postgres', 'CREATE TABLE t (i int)');
+
+# Take the base backup in segment N.  Recovery will start from this backup
+# and must replay segment N+1 where the TL2 switch point lands.
+$node_primary->backup('primary_backup');
+
+# Switch to segment N+1.  This is where the TL1->TL2 switch point will land.
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (1)');
+
+# Capture the TL1 file name for segment N+1.  We will assert later that
+# archive recovery did NOT restore this file (the TL1 version carries
+# divergent WAL past the switch point).
+my $tl1_switch_seg = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+# standby1 streams from primary.  Disable archiving so it does not publish
+# TL2 segments into the primary's archive, which would mask the bug.
+my $node_standby1 = PostgreSQL::Test::Cluster->new('standby1');
+$node_standby1->init_from_backup($node_primary, 'primary_backup',
+	has_streaming => 1);
+$node_standby1->append_conf('postgresql.conf', "archive_mode = off");
+$node_standby1->start;
+$node_primary->wait_for_catchup($node_standby1);
+
+# Promote standby1 while the primary is writing in segment N+1.  The TL1->TL2
+# switch point therefore falls inside segment N+1.
+$node_standby1->promote;
+$node_standby1->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()')
+  or die "Timed out waiting for standby1 promotion to TL2";
+
+# Old primary (TL1) continues writing divergent WAL in segment N+1 and then
+# N+2 and archives them.  These are the segments that recovery must not use.
+$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-1)');
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_primary->safe_psql('postgres', 'INSERT INTO t VALUES (-2)');
+my $tl1_last_seg = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_primary->poll_query_until('postgres',
+	"SELECT last_archived_wal >= '$tl1_last_seg' FROM pg_stat_archiver")
+  or die "Timed out waiting for primary to archive divergent TL1 segments";
+
+# On TL2 (standby1 is now the primary), switch to segment N+2 so the TL2->TL3
+# switch point lands one segment further than the TL1->TL2 switch.
+$node_standby1->safe_psql('postgres', 'SELECT pg_switch_wal()');
+$node_standby1->safe_psql('postgres', 'INSERT INTO t VALUES (2)');
+
+# Take a backup from the TL2 primary for standby2.
+$node_standby1->backup('standby1_backup');
+
+# standby2 streams from standby1 (TL2).  Same archiving restriction.
+my $node_standby2 = PostgreSQL::Test::Cluster->new('standby2');
+$node_standby2->init_from_backup($node_standby1, 'standby1_backup',
+	has_streaming => 1);
+$node_standby2->append_conf('postgresql.conf', "archive_mode = off");
+$node_standby2->start;
+$node_standby1->wait_for_catchup($node_standby2);
+
+# Promote standby2 while TL2 is writing in segment N+2.  The TL2->TL3 switch
+# point therefore falls inside segment N+2.
+$node_standby2->promote;
+$node_standby2->poll_query_until('postgres', 'SELECT NOT pg_is_in_recovery()')
+  or die "Timed out waiting for standby2 promotion to TL3";
+
+$node_primary->stop;
+$node_standby1->stop;
+$node_standby2->stop;
+
+# Copy the TL3 history file to the TL1 archive.  PostgreSQL builds it with
+# the full ancestry chain upon promotion:
+#   1  <TL2 switch lsn>   (switch from TL1 to TL2 happened in segment N+1)
+#   2  <TL3 switch lsn>   (switch from TL2 to TL3 happened in segment N+2)
+# TL2 segment N+1 is intentionally absent from the archive: standby1 had
+# archive_mode=off so it never wrote TL2 segments there.  Only the TL1
+# version of segment N+1 is present.
+my $archive = $node_primary->archive_dir;
+copy($node_standby2->data_dir . '/pg_wal/00000003.history',
+	"$archive/00000003.history")
+  or die "Could not copy 00000003.history: $!";
+
+# Build a recovery node from the TL1 base backup, replaying from the TL1
+# archive only (no streaming).
+my $node_rec = PostgreSQL::Test::Cluster->new('recovering');
+$node_rec->init_from_backup($node_primary, 'primary_backup', has_restoring => 1);
+$node_rec->enable_restoring($node_primary, 1);
+$node_rec->append_conf('postgresql.conf', "recovery_target_timeline = '3'");
+$node_rec->start;
+
+$node_rec->poll_query_until('postgres', 'SELECT pg_is_in_recovery()', 't')
+  or die "Node is not in recovery";
+
+# With the fix: for segment N+1, TL2 is the first eligible timeline (its
+# begin_seg == N+1).  TL2 segment N+1 is absent, so recovery stops and does
+# not fall back to the TL1 version.
+#
+# Without the fix (targetBeginSeg approach): targetBeginSeg == N+2 (TL3's
+# begin), and segno N+1 < N+2, so the guard does not fire.  Recovery falls
+# through and silently applies TL1 segment N+1, which carries divergent data.
+my $log_content = slurp_file($node_rec->logfile);
+unlike(
+	$log_content,
+	qr/restored log file "\Q$tl1_switch_seg\E"/,
+	"archive recovery did not use TL1 segment at TL2 switch point ($tl1_switch_seg)"
+);
+
+done_testing();
-- 
2.51.2

