Dear Fujii Masao,

Thank you for the review of the patch. Based on your comments I propose
a new version of the patch.

> +#include "storage/buf_internals.h"
> This include should be placed in alphabetical order.

I removed the include of the header file. I'm not sure, that it is a good
way to expose buffer internals here. Instead, I implemented a new function
BufferGetRefCount to be used in standby.c.

> + uint32 buf_refcount = BUF_STATE_GET_REFCOUNT(buf_state);
> <snip>
> + if (buf_refcount > 1)
> + continue;
>
> Wouldn't it be better to check explicitly whether we're still the waiter,
> instead of using BUF_STATE_GET_REFCOUNT()?
>
>     (buf_state & BM_PIN_COUNT_WAITER) != 0 &&
>     bufHdr->wait_backend_pgprocno == MyProcNumber

This is a precondition for the ResolveRecoveryConflictWithBufferPin function
that the current process should be a waiter process. See LockBufferForCleanup
where this state is set before the call of ResolveRecoveryConflictWithBufferPin.
I believe, there is no sense to check the waiter state because it doesn't
change in this function. Instead, I think, it is enough to just check for
refcount. One of the suggestions is to add an assert to check that the current
process is a waiter process, but I'm not sure about it. Let me know please if
I missed anything.

> The current control flow in the loop feels a bit hard to follow.
> Would something like the following be simpler?

I reorganized the control flow as well.

I also added a new tap-test as a part of the patch. I did some changes in the
tap test to make it stable. Let me know, please, if it should be in a separate
commit.

With best regards,
Vitaly
From fc0b3b6df48d7c5d69edf891cd84f8c4848a0e65 Mon Sep 17 00:00:00 2001
From: Vitaly Davydov <[email protected]>
Date: Wed, 20 May 2026 11:48:34 +0300
Subject: [PATCH] Fix deadlock detector activation in a recovery conflict

When the startup process in a deadlock with a backend, it sends the
signal to the backend to trigger the deadlock detector when
the deadlock timeout is elapsed (deadlock_timeout guc). Due to some
optimization in timeout.c, when spontaneous SIGALRM signals are
possible, which doesn't relate to any enabled timeout, the function
ResolveRecoveryConflictWithBufferPin can never send the signal to the
conflicting backend, becase the deadlock timeout will never be
triggered.

The patch fixes ResolveRecoveryConflictWithBufferPin by ignoring
spontaneous SIGALRM signals, that are possible in the current
implementation of timeout.c functionality.
---
 src/backend/storage/buffer/bufmgr.c           |  25 +++-
 src/backend/storage/ipc/standby.c             |  67 +++++----
 src/include/storage/bufmgr.h                  |   1 +
 src/include/storage/standby.h                 |   2 +-
 .../t/053_startup_backend_deadlock.pl         | 127 ++++++++++++++++++
 5 files changed, 194 insertions(+), 28 deletions(-)
 create mode 100644 src/test/recovery/t/053_startup_backend_deadlock.pl

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cc398db124d..ce5871fbd9b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4749,6 +4749,29 @@ BufferGetLSNAtomic(Buffer buffer)
 #endif
 }
 
+/*
+ * BufferGetRefCount
+ *		Return the current reference counter.
+ */
+uint32
+BufferGetRefCount(Buffer buffer)
+{
+	BufferDesc *bufHdr;
+	uint64		buf_state;
+	uint32		buf_refcount;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	bufHdr = GetBufferDescriptor(buffer - 1);
+
+	buf_state = LockBufHdr(bufHdr);
+	buf_refcount = BUF_STATE_GET_REFCOUNT(buf_state);
+	UnlockBufHdr(bufHdr);
+
+	return buf_refcount;
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelationBuffers
  *
@@ -6789,7 +6812,7 @@ LockBufferForCleanup(Buffer buffer)
 			/* Publish the bufid that Startup process waits on */
 			SetStartupBufferPinWaitBufId(buffer - 1);
 			/* Set alarm and then wait to be signaled by UnpinBuffer() */
-			ResolveRecoveryConflictWithBufferPin();
+			ResolveRecoveryConflictWithBufferPin(buffer);
 			/* Reset the published bufid */
 			SetStartupBufferPinWaitBufId(-1);
 		}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index de9092fdf5b..00399eafee3 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -790,13 +790,17 @@ cleanup:
  * Deadlocks are extremely rare, and relatively expensive to check for,
  * so we don't do a deadlock check right away ... only if we have had to wait
  * at least deadlock_timeout.
+ *
+ * The precondition: the current process should be the waiter process.
  */
 void
-ResolveRecoveryConflictWithBufferPin(void)
+ResolveRecoveryConflictWithBufferPin(Buffer buffer)
 {
 	TimestampTz ltime;
 
 	Assert(InHotStandby);
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
 
 	ltime = GetStandbyLimitTime();
 
@@ -833,35 +837,46 @@ ResolveRecoveryConflictWithBufferPin(void)
 		enable_timeouts(timeouts, cnt);
 	}
 
-	/*
-	 * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
-	 * by one of the timeouts established above.
-	 *
-	 * We assume that only UnpinBuffer() and the timeout requests established
-	 * above can wake us up here. WakeupRecovery() called by walreceiver or
-	 * SIGHUP signal handler, etc cannot do that because it uses the different
-	 * latch from that ProcWaitForSignal() waits on.
-	 */
-	ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
-
-	if (got_standby_delay_timeout)
-		SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
-	else if (got_standby_deadlock_timeout)
+	for (;;)
 	{
 		/*
-		 * Send out a request for hot-standby backends to check themselves for
-		 * deadlocks.
+		 * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+		 * by one of the timeouts established above.
 		 *
-		 * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
-		 * to be signaled by UnpinBuffer() again and send a request for
-		 * deadlocks check if deadlock_timeout happens. This causes the
-		 * request to continue to be sent every deadlock_timeout until the
-		 * buffer is unpinned or ltime is reached. This would increase the
-		 * workload in the startup process and backends. In practice it may
-		 * not be so harmful because the period that the buffer is kept pinned
-		 * is basically no so long. But we should fix this?
+		 * ProcWaitForSignal() can also wake up for unrelated reasons, so recheck
+		 * that the buffer is pinned by the current waiter process only (reference
+		 * counter should be 1). Continue waiting, if no registered timeout is
+		 * fired or the buffer is still pinned by other processes as well.
 		 */
-		SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
+		ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
+
+		/*
+		 * Once the reference count is less or equal to 1 and we are the waiter
+		 * process, no one uses the buffer at the moment. There is a chance to
+		 * lock the buffer exclusively.
+		 */
+		if (BufferGetRefCount(buffer) <= 1)
+			break;
+
+		if (got_standby_delay_timeout)
+			SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
+		else if (got_standby_deadlock_timeout)
+		{
+			/*
+			* Send out a request for hot-standby backends to check themselves for
+			* deadlocks.
+			*
+			* XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+			* to be signaled by UnpinBuffer() again and send a request for
+			* deadlocks check if deadlock_timeout happens. This causes the
+			* request to continue to be sent every deadlock_timeout until the
+			* buffer is unpinned or ltime is reached. This would increase the
+			* workload in the startup process and backends. In practice it may
+			* not be so harmful because the period that the buffer is kept pinned
+			* is basically no so long. But we should fix this?
+			*/
+			SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
+		}
 	}
 
 	/*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 6837b35fc6d..b7dcff85a8d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -313,6 +313,7 @@ extern bool BufferIsPermanent(Buffer buffer);
 extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
 extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator,
 						 ForkNumber *forknum, BlockNumber *blknum);
+uint32		BufferGetRefCount(Buffer buffer);
 
 extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
 
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 6a314c693cd..c75d87b7ddc 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -79,7 +79,7 @@ extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
 extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
 
 extern void ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict);
-extern void ResolveRecoveryConflictWithBufferPin(void);
+extern void ResolveRecoveryConflictWithBufferPin(Buffer buffer);
 extern void CheckRecoveryConflictDeadlock(void);
 extern void StandbyDeadLockHandler(void);
 extern void StandbyTimeoutHandler(void);
diff --git a/src/test/recovery/t/053_startup_backend_deadlock.pl b/src/test/recovery/t/053_startup_backend_deadlock.pl
new file mode 100644
index 00000000000..5bd24f315e6
--- /dev/null
+++ b/src/test/recovery/t/053_startup_backend_deadlock.pl
@@ -0,0 +1,127 @@
+#!/usr/bin/perl
+#
+# Test a deadlock between a backend and the startup processes when processing
+# XLOG_PRUNE_PAGE wal record. The test is based on 031_recovery_conflict.pl
+# vanilla test.
+#
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Test settings
+my $deadlock_timeout = 3000; # ms
+my $log_startup_progress_interval = 2000; # ms
+my $testdb = "testdb";
+my $backup_name = 'my_backup';
+my $table1 = "table1";
+my $table2 = "table2";
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+
+$node_primary->append_conf('postgresql.conf', qq[
+
+	# for deadlock test
+	max_prepared_transactions = 10
+
+	# wait some to test the wait paths as well, but not long for obvious reasons
+	max_standby_streaming_delay = -1
+
+	# Some of the recovery conflict logging code only gets exercised after
+	# deadlock_timeout. The test doesn't rely on that additional output, but it's
+	# nice to get some minimal coverage of that code.
+	log_recovery_conflict_waits = on
+
+	log_startup_progress_interval = ${log_startup_progress_interval}
+	deadlock_timeout = ${deadlock_timeout}
+	autovacuum = off
+]);
+
+$node_primary->start;
+$node_primary->backup($backup_name);
+
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start();
+
+my $log_location = -s $node_standby->logfile;
+
+# use a new database, to trigger database recovery conflict
+$node_primary->safe_psql('postgres', "CREATE DATABASE $testdb");
+#$node_primary->safe_psql('postgres', "CREATE TABLE $table2(a int, b int)");
+
+$node_primary->safe_psql($testdb, qq[
+	CREATE TABLE $table1(a int, b int);
+	CREATE TABLE $table2(a int, b int);
+	INSERT INTO $table1 VALUES (1);
+]);
+
+# Generate a few dead rows, to later be cleaned up by vacuum. Then acquire a
+# lock on another relation in a prepared xact, so it's held continuously by
+# the startup process. The standby psql will block acquiring that lock while
+# holding a pin that vacuum needs, triggering the deadlock.
+$node_primary->safe_psql($testdb, qq[
+	BEGIN;
+	INSERT INTO $table1(a) SELECT generate_series(1, 100) i;
+	ROLLBACK;
+]);
+
+$node_primary->safe_psql($testdb, qq[
+	BEGIN;
+	LOCK TABLE $table2;
+	PREPARE TRANSACTION 'lock';
+	INSERT INTO $table1(a) VALUES (170);
+	SELECT txid_current();
+]);
+
+$node_primary->wait_for_catchup($node_standby, 'replay', $node_primary->lsn('write'));
+
+my $psql_standby = $node_standby->background_psql($testdb, on_error_stop => 0);
+
+$psql_standby->query_until(qr/^1$/m, qq[
+	BEGIN;
+	-- hold pin
+	DECLARE test_recovery_conflict_cursor CURSOR FOR SELECT a FROM $table1;
+	FETCH FORWARD FROM test_recovery_conflict_cursor;
+	-- wait for lock held by prepared transaction
+	SELECT * FROM $table2;
+]);
+
+ok(1, "cursor holding conflicting pin, also waiting for lock, established");
+
+# VACUUM will prune away rows, causing a buffer pin conflict, while standby
+# psql is waiting on lock
+$node_primary->safe_psql($testdb, qq[VACUUM $table1;]);
+
+# Wait and check that the deadlock detector was triggered and found a deadlock
+# in the backend process (not in startup process).
+note("Waiting for deadlock detector to launch...");
+check_conflict_log("User transaction caused buffer deadlock with recovery.");
+
+# clean up for next tests
+$node_primary->safe_psql($testdb, qq[ROLLBACK PREPARED 'lock';]);
+
+# Check that expected number of conflicts show in pg_stat_database. Needs to
+# be tested before database is dropped, for obvious reasons.
+my $nconflicts = $node_standby->safe_psql($testdb, "SELECT conflicts FROM pg_stat_database WHERE datname = '$testdb'");
+note("number of recovery conflicts: $nconflicts");
+
+$psql_standby->quit();
+
+sub check_conflict_log
+{
+	my $message = shift;
+	my $old_log_location = $log_location;
+
+	$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);
+
+	cmp_ok($log_location, '>', $old_log_location,
+		"logfile contains terminated connection due to recovery conflict"
+	);
+}
+
+done_testing();
-- 
2.43.0

Reply via email to