From fe323d4aedcf8077b92e754a7348e538d658c886 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilipkumarb@google.com>
Date: Wed, 9 Jul 2025 05:36:56 +0000
Subject: [PATCH v2] Better way to prevent wal removal and slot invalidation
 during Binary upgrade

The existing method relies on a GUC hook to check the new parameter values,
issuing an error if they could potentially lead to WAL removal or slot
invalidation. This happens irrespective of the parameter's ultimate configuration.
So this may lead to error in some non problamatic cases as well.

Our improved solution directly prevents WAL removal and slot invalidation
in server when it is running in binary upgrade mode. This eliminates the need for
the GUC check hook and removes the prerequisite of starting the server with
particular values during the upgrade process.
---
 src/backend/access/transam/xlog.c   | 28 +++++++------------------
 src/backend/replication/slot.c      | 32 ++++-------------------------
 src/backend/utils/misc/guc_tables.c |  4 ++--
 src/bin/pg_upgrade/server.c         | 18 ----------------
 src/include/utils/guc_hooks.h       |  4 ----
 5 files changed, 13 insertions(+), 73 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a8cc6402d62..91568024aed 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2346,25 +2346,6 @@ check_wal_segment_size(int *newval, void **extra, GucSource source)
 	return true;
 }
 
-/*
- * GUC check_hook for max_slot_wal_keep_size
- *
- * We don't allow the value of max_slot_wal_keep_size other than -1 during the
- * binary upgrade. See start_postmaster() in pg_upgrade for more details.
- */
-bool
-check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source)
-{
-	if (IsBinaryUpgrade && *newval != -1)
-	{
-		GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.",
-							"max_slot_wal_keep_size");
-		return false;
-	}
-
-	return true;
-}
-
 /*
  * At a checkpoint, how many WAL segments to recycle as preallocated future
  * XLOG segments? Returns the highest segment that should be preallocated.
@@ -8159,11 +8140,16 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
 	{
 		XLByteToSeg(keep, segno, wal_segment_size);
 
-		/* Cap by max_slot_wal_keep_size ... */
-		if (max_slot_wal_keep_size_mb >= 0)
+		/*
+		 * Avoid  WAL removal by the checkpointer process during Binary
+		 * upgrade.  If WALs required by logical replication slots are removed,
+		 * the slots are unusable.
+		 */
+		if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade)
 		{
 			uint64		slot_keep_segs;
 
+			/* Cap by max_slot_wal_keep_size ... */
 			slot_keep_segs =
 				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
 
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index f369fce2485..82c7d263f94 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1890,15 +1890,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 
 		SpinLockRelease(&s->mutex);
 
-		/*
-		 * The logical replication slots shouldn't be invalidated as GUC
-		 * max_slot_wal_keep_size is set to -1 and
-		 * idle_replication_slot_timeout is set to 0 during the binary
-		 * upgrade. See check_old_cluster_for_valid_slots() where we ensure
-		 * that no invalidated before the upgrade.
-		 */
-		Assert(!(*invalidated && SlotIsLogical(s) && IsBinaryUpgrade));
-
 		/*
 		 * Calculate the idle time duration of the slot if slot is marked
 		 * invalidated with RS_INVAL_IDLE_TIMEOUT.
@@ -2045,6 +2036,10 @@ restart:
 		if (!s->in_use)
 			continue;
 
+		/* Prevent logical slot invalidation during binary upgrade. */
+		if (SlotIsLogical(s) && IsBinaryUpgrade)
+			continue;
+
 		if (InvalidatePossiblyObsoleteSlot(possible_causes, s, oldestLSN, dboid,
 										   snapshotConflictHorizon,
 										   &invalidated))
@@ -3057,22 +3052,3 @@ WaitForStandbyConfirmation(XLogRecPtr wait_for_lsn)
 
 	ConditionVariableCancelSleep();
 }
-
-/*
- * GUC check_hook for idle_replication_slot_timeout
- *
- * The value of idle_replication_slot_timeout must be set to 0 during
- * a binary upgrade. See start_postmaster() in pg_upgrade for more details.
- */
-bool
-check_idle_replication_slot_timeout(int *newval, void **extra, GucSource source)
-{
-	if (IsBinaryUpgrade && *newval != 0)
-	{
-		GUC_check_errdetail("\"%s\" must be set to 0 during binary upgrade mode.",
-							"idle_replication_slot_timeout");
-		return false;
-	}
-
-	return true;
-}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 511dc32d519..e3f41d44185 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3081,7 +3081,7 @@ struct config_int ConfigureNamesInt[] =
 		},
 		&max_slot_wal_keep_size_mb,
 		-1, -1, MAX_KILOBYTES,
-		check_max_slot_wal_keep_size, NULL, NULL
+		NULL, NULL, NULL
 	},
 
 	{
@@ -3104,7 +3104,7 @@ struct config_int ConfigureNamesInt[] =
 		},
 		&idle_replication_slot_timeout_mins,
 		0, 0, INT_MAX / SECS_PER_MINUTE,
-		check_idle_replication_slot_timeout, NULL, NULL
+		NULL, NULL, NULL
 	},
 
 	{
diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c
index 873e5b5117b..7eb15bc7d5a 100644
--- a/src/bin/pg_upgrade/server.c
+++ b/src/bin/pg_upgrade/server.c
@@ -241,24 +241,6 @@ start_postmaster(ClusterInfo *cluster, bool report_and_exit_on_error)
 	if (cluster == &new_cluster)
 		appendPQExpBufferStr(&pgoptions, " -c synchronous_commit=off -c fsync=off -c full_page_writes=off");
 
-	/*
-	 * Use max_slot_wal_keep_size as -1 to prevent the WAL removal by the
-	 * checkpointer process.  If WALs required by logical replication slots
-	 * are removed, the slots are unusable.  This setting prevents the
-	 * invalidation of slots during the upgrade. We set this option when
-	 * cluster is PG17 or later because logical replication slots can only be
-	 * migrated since then. Besides, max_slot_wal_keep_size is added in PG13.
-	 */
-	if (GET_MAJOR_VERSION(cluster->major_version) >= 1700)
-		appendPQExpBufferStr(&pgoptions, " -c max_slot_wal_keep_size=-1");
-
-	/*
-	 * Use idle_replication_slot_timeout=0 to prevent slot invalidation due to
-	 * idle_timeout by checkpointer process during upgrade.
-	 */
-	if (GET_MAJOR_VERSION(cluster->major_version) >= 1800)
-		appendPQExpBufferStr(&pgoptions, " -c idle_replication_slot_timeout=0");
-
 	/*
 	 * Use -b to disable autovacuum and logical replication launcher
 	 * (effective in PG17 or later for the latter).
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 799fa7ace68..82ac8646a8d 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -84,8 +84,6 @@ extern const char *show_log_timezone(void);
 extern void assign_maintenance_io_concurrency(int newval, void *extra);
 extern void assign_io_max_combine_limit(int newval, void *extra);
 extern void assign_io_combine_limit(int newval, void *extra);
-extern bool check_max_slot_wal_keep_size(int *newval, void **extra,
-										 GucSource source);
 extern void assign_max_wal_size(int newval, void *extra);
 extern bool check_max_stack_depth(int *newval, void **extra, GucSource source);
 extern void assign_max_stack_depth(int newval, void *extra);
@@ -176,7 +174,5 @@ extern void assign_wal_sync_method(int new_wal_sync_method, void *extra);
 extern bool check_synchronized_standby_slots(char **newval, void **extra,
 											 GucSource source);
 extern void assign_synchronized_standby_slots(const char *newval, void *extra);
-extern bool check_idle_replication_slot_timeout(int *newval, void **extra,
-												GucSource source);
 
 #endif							/* GUC_HOOKS_H */
-- 
2.50.0.727.gbf7dc18ff4-goog

