From 0a748898d368c84c29950bf45da01a846a637728 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilipkumarb@google.com>
Date: Thu, 10 Jul 2025 06:04:49 +0000
Subject: [PATCH v3] Better way to prevent wal removal during binary upgrade

The existing method relies on a GUC hook to check the new parameter values,
issuing an error if they could potentially lead to WAL removal. This happens
irrespective of the parameter's ultimate configuration.  So this may lead to
error in some non problamatic cases as well.

Our improved solution directly prevents WAL removal in server when it is running
in binary upgrade mode. This eliminates the need for the GUC check hook and removes
the prerequisite of starting the server with particular values during the upgrade
process.
---
 src/backend/access/transam/xlog.c   | 33 +++++++----------------------
 src/backend/replication/slot.c      | 12 ++++-------
 src/backend/utils/misc/guc_tables.c |  2 +-
 src/bin/pg_upgrade/server.c         | 11 ----------
 src/include/utils/guc_hooks.h       |  2 --
 5 files changed, 13 insertions(+), 47 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a00786d40c8..ffea4993177 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2215,25 +2215,6 @@ check_wal_segment_size(int *newval, void **extra, GucSource source)
 	return true;
 }
 
-/*
- * GUC check_hook for max_slot_wal_keep_size
- *
- * We don't allow the value of max_slot_wal_keep_size other than -1 during the
- * binary upgrade. See start_postmaster() in pg_upgrade for more details.
- */
-bool
-check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source)
-{
-	if (IsBinaryUpgrade && *newval != -1)
-	{
-		GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.",
-							"max_slot_wal_keep_size");
-		return false;
-	}
-
-	return true;
-}
-
 /*
  * At a checkpoint, how many WAL segments to recycle as preallocated future
  * XLOG segments? Returns the highest segment that should be preallocated.
@@ -7992,17 +7973,19 @@ KeepLogSeg(XLogRecPtr recptr, XLogRecPtr slotsMinReqLSN, XLogSegNo *logSegNo)
 	XLByteToSeg(recptr, currSegNo, wal_segment_size);
 	segno = currSegNo;
 
-	/*
-	 * Calculate how many segments are kept by slots first, adjusting for
-	 * max_slot_wal_keep_size.
-	 */
+	/* Calculate how many segments are kept by slots. */
 	keep = slotsMinReqLSN;
 	if (keep != InvalidXLogRecPtr && keep < recptr)
 	{
 		XLByteToSeg(keep, segno, wal_segment_size);
 
-		/* Cap by max_slot_wal_keep_size ... */
-		if (max_slot_wal_keep_size_mb >= 0)
+		/*
+		 * Account for max_slot_wal_keep_size to avoid keeping more than
+		 * configured.  However, don't do that during a binary upgrade: if
+		 * slots were to be invalidated because of this, it would not be
+		 * possible to preserve logical ones during the upgrade.
+		 */
+		if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade)
 		{
 			uint64		slot_keep_segs;
 
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index a1d4768623f..a234e2ca9c1 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1671,14 +1671,6 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
 
 		SpinLockRelease(&s->mutex);
 
-		/*
-		 * The logical replication slots shouldn't be invalidated as GUC
-		 * max_slot_wal_keep_size is set to -1 during the binary upgrade. See
-		 * check_old_cluster_for_valid_slots() where we ensure that no
-		 * invalidated before the upgrade.
-		 */
-		Assert(!(*invalidated && SlotIsLogical(s) && IsBinaryUpgrade));
-
 		if (active_pid != 0)
 		{
 			/*
@@ -1805,6 +1797,10 @@ restart:
 		if (!s->in_use)
 			continue;
 
+		/* Prevent invalidation of logical slots during binary upgrade. */
+		if (SlotIsLogical(s) && IsBinaryUpgrade)
+			continue;
+
 		if (InvalidatePossiblyObsoleteSlot(cause, s, oldestLSN, dboid,
 										   snapshotConflictHorizon,
 										   &invalidated))
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index c42fccf3fe7..a997dcb7dbc 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2953,7 +2953,7 @@ struct config_int ConfigureNamesInt[] =
 		},
 		&max_slot_wal_keep_size_mb,
 		-1, -1, MAX_KILOBYTES,
-		check_max_slot_wal_keep_size, NULL, NULL
+		NULL, NULL, NULL
 	},
 
 	{
diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c
index 91bcb4dbc7e..b223d5afddf 100644
--- a/src/bin/pg_upgrade/server.c
+++ b/src/bin/pg_upgrade/server.c
@@ -241,17 +241,6 @@ start_postmaster(ClusterInfo *cluster, bool report_and_exit_on_error)
 	if (cluster == &new_cluster)
 		appendPQExpBufferStr(&pgoptions, " -c synchronous_commit=off -c fsync=off -c full_page_writes=off");
 
-	/*
-	 * Use max_slot_wal_keep_size as -1 to prevent the WAL removal by the
-	 * checkpointer process.  If WALs required by logical replication slots
-	 * are removed, the slots are unusable.  This setting prevents the
-	 * invalidation of slots during the upgrade. We set this option when
-	 * cluster is PG17 or later because logical replication slots can only be
-	 * migrated since then. Besides, max_slot_wal_keep_size is added in PG13.
-	 */
-	if (GET_MAJOR_VERSION(cluster->major_version) >= 1700)
-		appendPQExpBufferStr(&pgoptions, " -c max_slot_wal_keep_size=-1");
-
 	/*
 	 * Use -b to disable autovacuum and logical replication launcher
 	 * (effective in PG17 or later for the latter).
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 8fd91af3887..1babff78bf3 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -86,8 +86,6 @@ extern bool check_maintenance_io_concurrency(int *newval, void **extra,
 extern void assign_maintenance_io_concurrency(int newval, void *extra);
 extern bool check_max_connections(int *newval, void **extra, GucSource source);
 extern bool check_max_wal_senders(int *newval, void **extra, GucSource source);
-extern bool check_max_slot_wal_keep_size(int *newval, void **extra,
-										 GucSource source);
 extern void assign_max_wal_size(int newval, void *extra);
 extern bool check_max_worker_processes(int *newval, void **extra,
 									   GucSource source);
-- 
2.50.0.727.gbf7dc18ff4-goog

