From b230135e9f57d239260719deb88f8c1ed0e1e83c Mon Sep 17 00:00:00 2001
From: Zhijie Hou <houzj.fnst@fujitsu.com>
Date: Mon, 8 Dec 2025 13:09:44 +0800
Subject: [PATCH v9_PG17] Prevent invalidation of newly created replication
 slots.

A race condition could cause a newly created replication slot to become
invalidated between WAL reservation and a checkpoint.

Previously, if the required WAL was removed, we retried the reservation
process. However, the slot could still be invalidated before the retry if
the WAL was not yet removed but the checkpoint advanced the redo pointer
beyond the slot's intended restart LSN and computed the minimum LSN that
needs to be preserved for the slots.

The fix is to acquire an exclusive lock on ReplicationSlotAllocationLock during
WAL reservation, and a shared lock during the minimum LSN calculation at
checkpoints to serialize the process. This ensures that, if WAL reservation
occurs first, the checkpoint waits until restart_lsn is updated before
calculating the minimum LSN. If the checkpoint runs first, subsequent WAL
reservations pick a position at or after the latest checkpoint's redo pointer.
---
 src/backend/access/transam/xlog.c | 16 ++++++
 src/backend/replication/slot.c    | 94 ++++++++++++++++---------------
 2 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 17b88ff6453..5915e30d04d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7108,8 +7108,16 @@ CreateCheckPoint(int flags)
 	 * according to synchronized LSNs of replication slots.  The slot's LSN
 	 * might be advanced concurrently, so we call this before
 	 * CheckPointReplicationSlots() synchronizes replication slots.
+	 *
+	 * Additionally, acquiring the Allocation lock to serialize the minimum LSN
+	 * calculation with concurrent slot WAL reservation. This ensures that the
+	 * WAL position being reserved is either included in the miminum LSN or is
+	 * beyond or equal to the redo pointer of the current checkpoint (See
+	 * ReplicationSlotReserveWal for details).
 	 */
+	LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
 	slotsMinReqLSN = XLogGetReplicationSlotMinimumLSN();
+	LWLockRelease(ReplicationSlotAllocationLock);
 
 	/*
 	 * In some cases there are groups of actions that must all occur on one
@@ -7681,8 +7689,16 @@ CreateRestartPoint(int flags)
 	 * according to synchronized LSNs of replication slots.  The slot's LSN
 	 * might be advanced concurrently, so we call this before
 	 * CheckPointReplicationSlots() synchronizes replication slots.
+	 *
+	 * Additionally, acquiring the Allocation lock to serialize the minimum LSN
+	 * calculation with concurrent slot WAL reservation. This ensures that the
+	 * WAL position being reserved is either included in the miminum LSN or is
+	 * beyond or equal to the redo pointer of the current checkpoint (See
+	 * ReplicationSlotReserveWal for details).
 	 */
+	LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED);
 	slotsMinReqLSN = XLogGetReplicationSlotMinimumLSN();
+	LWLockRelease(ReplicationSlotAllocationLock);
 
 	if (log_checkpoints)
 		LogCheckpointStart(flags, true);
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 33b7da21a94..e4c6a805daf 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1430,62 +1430,64 @@ void
 ReplicationSlotReserveWal(void)
 {
 	ReplicationSlot *slot = MyReplicationSlot;
+	XLogSegNo	segno;
+	XLogRecPtr	restart_lsn;
 
 	Assert(slot != NULL);
 	Assert(slot->data.restart_lsn == InvalidXLogRecPtr);
 
 	/*
-	 * The replication slot mechanism is used to prevent removal of required
-	 * WAL. As there is no interlock between this routine and checkpoints, WAL
-	 * segments could concurrently be removed when a now stale return value of
-	 * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that
-	 * this happens we'll just retry.
+	 * The replication slot mechanism is used to prevent the removal of
+	 * required WAL.
+	 *
+	 * Acquire an exclusive lock to prevent the checkpoint process from
+	 * concurrently computing the minimum slot LSN (see the call to
+	 * XLogGetReplicationSlotMinimumLSN in CreateCheckPoint). This ensures that
+	 * the WAL reserved for replication cannot be removed during a checkpoint.
+	 *
+	 * The mechanism is reliable because if WAL reservation occurs first, the
+	 * checkpoint must wait for the restart_lsn update before determining the
+	 * minimum non-removable LSN. On the other hand, if the checkpoint happens
+	 * first, subsequent WAL reservations will select positions at or beyond
+	 * the redo pointer of that checkpoint.
 	 */
-	while (true)
-	{
-		XLogSegNo	segno;
-		XLogRecPtr	restart_lsn;
+	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
 
-		/*
-		 * For logical slots log a standby snapshot and start logical decoding
-		 * at exactly that position. That allows the slot to start up more
-		 * quickly. But on a standby we cannot do WAL writes, so just use the
-		 * replay pointer; effectively, an attempt to create a logical slot on
-		 * standby will cause it to wait for an xl_running_xact record to be
-		 * logged independently on the primary, so that a snapshot can be
-		 * built using the record.
-		 *
-		 * None of this is needed (or indeed helpful) for physical slots as
-		 * they'll start replay at the last logged checkpoint anyway. Instead
-		 * return the location of the last redo LSN. While that slightly
-		 * increases the chance that we have to retry, it's where a base
-		 * backup has to start replay at.
-		 */
-		if (SlotIsPhysical(slot))
-			restart_lsn = GetRedoRecPtr();
-		else if (RecoveryInProgress())
-			restart_lsn = GetXLogReplayRecPtr(NULL);
-		else
-			restart_lsn = GetXLogInsertRecPtr();
+	/*
+	 * For logical slots log a standby snapshot and start logical decoding at
+	 * exactly that position. That allows the slot to start up more quickly.
+	 * But on a standby we cannot do WAL writes, so just use the replay
+	 * pointer; effectively, an attempt to create a logical slot on standby
+	 * will cause it to wait for an xl_running_xact record to be logged
+	 * independently on the primary, so that a snapshot can be built using the
+	 * record.
+	 *
+	 * None of this is needed (or indeed helpful) for physical slots as
+	 * they'll start replay at the last logged checkpoint anyway. Instead,
+	 * return the location of the last redo LSN, where a base backup has to
+	 * start replay at.
+	 */
+	if (SlotIsPhysical(slot))
+		restart_lsn = GetRedoRecPtr();
+	else if (RecoveryInProgress())
+		restart_lsn = GetXLogReplayRecPtr(NULL);
+	else
+		restart_lsn = GetXLogInsertRecPtr();
 
-		SpinLockAcquire(&slot->mutex);
-		slot->data.restart_lsn = restart_lsn;
-		SpinLockRelease(&slot->mutex);
+	SpinLockAcquire(&slot->mutex);
+	slot->data.restart_lsn = restart_lsn;
+	SpinLockRelease(&slot->mutex);
 
-		/* prevent WAL removal as fast as possible */
-		ReplicationSlotsComputeRequiredLSN();
+	/* prevent WAL removal as fast as possible */
+	ReplicationSlotsComputeRequiredLSN();
 
-		/*
-		 * If all required WAL is still there, great, otherwise retry. The
-		 * slot should prevent further removal of WAL, unless there's a
-		 * concurrent ReplicationSlotsComputeRequiredLSN() after we've written
-		 * the new restart_lsn above, so normally we should never need to loop
-		 * more than twice.
-		 */
-		XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
-		if (XLogGetLastRemovedSegno() < segno)
-			break;
-	}
+	/* Checkpoint shouldn't remove the required WAL. */
+	XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
+	if (XLogGetLastRemovedSegno() >= segno)
+		elog(ERROR, "WAL required by replication slot %s has been removed concurrently",
+			 NameStr(slot->data.name));
+
+	LWLockRelease(ReplicationSlotAllocationLock);
 
 	if (!RecoveryInProgress() && SlotIsLogical(slot))
 	{
-- 
2.51.1.windows.1

