Here's a new version of this. Notable changes:

- I reverted the changes to ExtendMultiXactOffset(), so that it deals with wraparound and FirstMultiXactId the same way as before. The caller never passes FirstMultiXactId, but the changed comments and the assertion were confusing, so I felt it's best to just leave it alone

- bunch of comment changes & other cosmetic changes

- I modified TrimMultiXact() to initialize the page corresponding to 'nextMulti', because if you just swapped the binary to the new one, and nextMulti was at a page boundary, it would not be initialized yet.

If we want to backpatch this, and I think we need to because this fixes real bugs, we need to think through all the upgrade scenarios. I made the above-mentioned changes to TrimMultiXact(), but it doesn't fix all the problems.

What happens if you replay the WAL generated with old binary, without this patch, with new binary? It's not good:

LOG: database system was not properly shut down; automatic recovery in progress
LOG:  redo starts at 0/01766A68
FATAL:  could not access status of transaction 2048
DETAIL: Could not read from file "pg_multixact/offsets/0000" at offset 8192: read too few bytes. CONTEXT: WAL redo at 0/05561030 for MultiXact/CREATE_ID: 2047 offset 4093 nmembers 2: 2830 (keysh) 2831 (keysh)
LOG:  startup process (PID 3130184) exited with exit code 1

This is because the WAL, created with old version, contains records like this:

lsn: 0/05561030, prev 0/05561008, desc: CREATE_ID 2047 offset 4093 nmembers 2: 2830 (keysh) 2831 (keysh)
lsn: 0/055611A8, prev 0/05561180, desc: ZERO_OFF_PAGE 1
lsn: 0/055611D0, prev 0/055611A8, desc: CREATE_ID 2048 offset 4095 nmembers 2: 2831 (keysh) 2832 (keysh)


When replaying that with the new version, replay of the CREATE_ID 2047 record tries to set the next multixid's offset, but the page hasn't been initialized yet. With the new version, the ZERO_OFF_PAGE 1 record would appear before the CREATE_ID 2047 record, but we can't change the WAL that already exists.

- Heikki
From e84d56b24d034694822710bca971eb7ac3a5a930 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Wed, 26 Nov 2025 18:07:01 +0200
Subject: [PATCH v11 1/1] Avoid multixact edge case 2 by writing the next
 offset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RecordNewMultiXact now writes the offset of the
following multixact so readers never block waiting for “case 2”.
The new TAP test reproduces IPC/MultixactCreation hangs and verifies
that previously recorded multis stay readable across crash recovery.

Author: Andrey Borodin <[email protected]>
Reviewed-by: Dmitry Yurichev <[email protected]>
Reviewed-by: Álvaro Herrera <[email protected]>
Reviewed-by: Kirill Reshke <[email protected]>
Reviewed-by: Ivan Bykov <[email protected]>
---
 src/backend/access/transam/multixact.c        | 150 +++++++++++-------
 src/test/modules/test_slru/t/001_multixact.pl | 141 ++++++++--------
 src/test/modules/test_slru/test_multixact.c   |   1 -
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  40 +++++
 src/test/recovery/t/017_shm.pl                |  38 +----
 5 files changed, 201 insertions(+), 169 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 9d5f130af7e..d1e15ff5624 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -79,7 +79,6 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
-#include "storage/condition_variable.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -271,12 +270,6 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
-	/*
-	 * This is used to sleep until a multixact offset is written when we want
-	 * to create the next one.
-	 */
-	ConditionVariable nextoff_cv;
-
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	int			entryno;
 	int			slotno;
 	MultiXactOffset *offptr;
-	int			i;
+	MultiXactId next;
+	int64		next_pageno;
+	int			next_entryno;
+	MultiXactOffset *next_offptr;
 	LWLock	   *lock;
 	LWLock	   *prevlock = NULL;
 
+	/* position of this multixid in the offsets SLRU area  */
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	/* position of the next multixid */
+	next = multi + 1;
+	if (next < FirstMultiXactId)
+		next = FirstMultiXactId;
+	next_pageno = MultiXactIdToOffsetPage(next);
+	next_entryno = MultiXactIdToOffsetEntry(next);
+
+	/*
+	 * Set the starting offset of this multixid's members.
+	 *
+	 * In the common case, it was already be set by the previous
+	 * RecordNewMultiXact call, as this was the next multixid of the previous
+	 * multixid.  But if multiple backends are generating multixids
+	 * concurrently, we might race ahead and get called before previous
+	 * multixid.
+	 */
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
 
@@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	if (*offptr != offset)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*offptr == 0);
+		*offptr = offset;
+		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	}
 
-	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	/*
+	 * Set the next multixid's offset to the end of this multixid's members.
+	 */
+	if (next_pageno == pageno)
+	{
+		next_offptr = offptr + 1;
+	}
+	else
+	{
+		/* must be the first entry on the page */
+		Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+		/* Swap the lock for a lock on the next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+		next_offptr += next_entryno;
+	}
+
+	if (*next_offptr != offset + nmembers)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*next_offptr == 0);
+		*next_offptr = offset + nmembers;
+		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+	}
 
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
-	 */
-	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
 	prev_pageno = -1;
 
-	for (i = 0; i < nmembers; i++, offset++)
+	for (int i = 0; i < nmembers; i++, offset++)
 	{
 		TransactionId *memberptr;
 		uint32	   *flagsptr;
@@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 			result = FirstMultiXactId;
 	}
 
-	/* Make sure there is room for the MXID in the file.  */
-	ExtendMultiXactOffset(result);
+	/*
+	 * Make sure there is room for the next MXID in the file.  Assigning this
+	 * MXID sets the next MXID offset already.
+	 */
+	ExtendMultiXactOffset(result + 1);
 
 	/*
 	 * Reserve the members space, similarly to above.  Also, be careful not to
@@ -1304,7 +1348,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	MultiXactOffset nextOffset;
 	MultiXactMember *ptr;
 	LWLock	   *lock;
-	bool		slept = false;
 
 	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
@@ -1381,23 +1424,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * one's.  However, there are some corner cases to worry about:
 	 *
 	 * 1. This multixact may be the latest one created, in which case there is
-	 * no next one to look at.  In this case the nextOffset value we just
-	 * saved is the correct endpoint.
+	 * no next one to look at.  The next multixact's offset should be set
+	 * already, as we set it in RecordNewMultiXact(), but we used to not do
+	 * that in older minor versions.  To cope with that case, if this
+	 * multixact is the latest one created, use the nextOffset value we read
+	 * above as the endpoint.
 	 *
-	 * 2. The next multixact may still be in process of being filled in: that
-	 * is, another process may have done GetNewMultiXactId but not yet written
-	 * the offset entry for that ID.  In that scenario, it is guaranteed that
-	 * the offset entry for that multixact exists (because GetNewMultiXactId
-	 * won't release MultiXactGenLock until it does) but contains zero
-	 * (because we are careful to pre-zero offset pages). Because
-	 * GetNewMultiXactId will never return zero as the starting offset for a
-	 * multixact, when we read zero as the next multixact's offset, we know we
-	 * have this case.  We handle this by sleeping on the condition variable
-	 * we have just for this; the process in charge will signal the CV as soon
-	 * as it has finished writing the multixact offset.
-	 *
-	 * 3. Because GetNewMultiXactId increments offset zero to offset one to
-	 * handle case #2, there is an ambiguity near the point of offset
+	 * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
+	 * for to mean "unset", there is an ambiguity near the point of offset
 	 * wraparound.  If we see next multixact's offset is one, is that our
 	 * multixact's actual endpoint, or did it end at zero with a subsequent
 	 * increment?  We handle this using the knowledge that if the zero'th
@@ -1409,7 +1443,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * cases, so it seems better than holding the MultiXactGenLock for a long
 	 * time on every multixact creation.
 	 */
-retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
@@ -1473,16 +1506,10 @@ retry:
 
 		if (nextMXOffset == 0)
 		{
-			/* Corner case 2: next multixact is still being filled in */
-			LWLockRelease(lock);
-			CHECK_FOR_INTERRUPTS();
-
-			INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-			ConditionVariableSleep(&MultiXactState->nextoff_cv,
-								   WAIT_EVENT_MULTIXACT_CREATION);
-			slept = true;
-			goto retry;
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("MultiXact %d has invalid next offset",
+							multi)));
 		}
 
 		length = nextMXOffset - offset;
@@ -1491,12 +1518,6 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
-	/*
-	 * If we slept above, clean up state; it's no longer needed.
-	 */
-	if (slept)
-		ConditionVariableCancelSleep();
-
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 
 	truelength = 0;
@@ -1539,7 +1560,7 @@ retry:
 
 		if (!TransactionIdIsValid(*xactptr))
 		{
-			/* Corner case 3: we must be looking at unused slot zero */
+			/* Corner case 2: we must be looking at unused slot zero */
 			Assert(offset == 0);
 			continue;
 		}
@@ -1986,7 +2007,6 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
@@ -2132,26 +2152,36 @@ TrimMultiXact(void)
 						pageno);
 
 	/*
+	 * Set the offset of the last multixact on the offsets page.
+	 *
+	 * This is normally done in RecordNewMultiXact() of the previous
+	 * multixact, but we used to not do that in older minor versions.  To
+	 * ensure that the next offset is set if the binary was just upgraded from
+	 * an older minor version, do it now.
+	 *
 	 * Zero out the remainder of the current offsets page.  See notes in
 	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
 	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
 	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * nonzero offset values.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
-	if (entryno != 0)
 	{
 		int			slotno;
 		MultiXactOffset *offptr;
 		LWLock	   *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 
 		LWLockAcquire(lock, LW_EXCLUSIVE);
-		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
+		if (entryno == 0)
+			slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+		else
+			slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
 
-		MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+		*offptr = offset;
+		if ((entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
+			MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
 
 		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 		LWLockRelease(lock);
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..2f85802f920 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes')
 {
 	plan skip_all => 'Injection points not supported by this build';
 }
+if ($windows_os)
+{
+	plan skip_all => 'Kill9 works unpredicatably on Windows';
+}
 
 my ($node, $result);
 
@@ -25,96 +29,87 @@ $node = PostgreSQL::Test::Cluster->new('mike');
 $node->init;
 $node->append_conf('postgresql.conf',
 	"shared_preload_libraries = 'test_slru,injection_points'");
+# Set the cluster's next multitransaction to 0xFFFFFFF0.
+my $node_pgdata = $node->data_dir;
+command_ok(
+	[
+		'pg_resetwal',
+		'--multixact-ids' => '0xFFFFFFF0,0xFFFFFFF0',
+		$node_pgdata
+	],
+	"set the cluster's next multitransaction to 0xFFFFFFF0");
+command_ok(
+	[
+		'dd',
+		'if=/dev/zero',
+		"of=$node_pgdata/pg_multixact/offsets/FFFF",
+		'bs=4',
+		'count=65536'
+	],
+	"init SLRU file");
+
+command_ok(
+	[
+		'rm',
+		"$node_pgdata/pg_multixact/offsets/0000",
+	],
+	"drop old SLRU file");
+
 $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Another multixact test: loosing some multixact must not affect reading near
+# multixacts, even after a crash.
+my $bg_psql = $node->background_psql('postgres');
 
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $multi = $bg_psql->query_safe(
+	q(SELECT test_create_multixact();));
+
+# The space for next multi will be allocated, but it will never be actually
+# recorded.
 $node->safe_psql('postgres',
 	q{SELECT injection_points_attach('multixact-create-from-members','wait');}
 );
 
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-	qr/start/, q{
-	\echo start
+$bg_psql->query_until(
+	qr/deploying lost multi/, q(
+\echo deploying lost multi
 	SELECT test_create_multixact();
-});
+));
 
 $node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, test-multixact-read',
-	"matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, multixact-get-members-cv-sleep',
-	"matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
 $node->safe_psql('postgres',
 	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
 
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+	q{checkpoint;});
 
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# One more multitransaction to effectivelt emit WAL record about next
+# multitransaction (to avaoid corener case 1).
+$node->safe_psql('postgres',
+	q{SELECT test_create_multixact();});
+
+# All set and done, it's time for hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
+
+# Verify thet recorded multi is readble, this call must not hang.
+# Also note that all injection points disappeared after server restart.
+my $timed_out = 0;
+$node->safe_psql(
+	'postgres',
+	qq{SELECT test_read_multixact('$multi'::xid);},
+	timeout => $PostgreSQL::Test::Utils::timeout_default,
+	timed_out => \$timed_out);
+ok($timed_out == 0, 'recorded multi is readble');
+
+# Test mxidwraparound
+foreach my $i (1 .. 32) {
+$node->safe_psql('postgres',q{SELECT test_create_multixact();});
+}
 
 $node->stop;
 
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c
index 6c9b0420717..2fd67273ee5 100644
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
 	MultiXactId id = PG_GETARG_TRANSACTIONID(0);
 	MultiXactMember *members;
 
-	INJECTION_POINT("test-multixact-read", NULL);
 	/* discard caches */
 	AtEOXact_MultiXact();
 
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..e810f123f93 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1191,6 +1191,46 @@ sub start
 
 =pod
 
+=item $node->poll_start() => success_or_failure
+
+Polls start after kill9
+
+We may need retries to start a new postmaster.  Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+	my ($self) = @_;
+
+	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		$self->start(fail_ok => 1) && return 1;
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		# Clean up in case the start attempt just timed out or some such.
+		$self->stop('fast', fail_ok => 1);
+
+		$attempts++;
+	}
+
+	# Try one last time without fail_ok, which will BAIL_OUT unless it
+	# succeeds.
+	$self->start && return 1;
+	return 0;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index c73aa3f0c2c..ac238544217 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -67,7 +67,7 @@ log_ipcs();
 # Upon postmaster death, postmaster children exit automatically.
 $gnat->kill9;
 log_ipcs();
-poll_start($gnat);    # gnat recycles its former shm key.
+$gnat->poll_start;    # gnat recycles its former shm key.
 log_ipcs();
 
 note "removing the conflicting shmem ...";
@@ -82,7 +82,7 @@ log_ipcs();
 # the higher-keyed segment that the previous postmaster was using.
 # That's not great, but key collisions should be rare enough to not
 # make this a big problem.
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 $gnat->stop;
 log_ipcs();
@@ -174,43 +174,11 @@ $slow_client->finish;    # client has detected backend termination
 log_ipcs();
 
 # now startup should work
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 
 # finish testing
 $gnat->stop;
 log_ipcs();
 
-
-# We may need retries to start a new postmaster.  Causes:
-# - kernel is slow to deliver SIGKILL
-# - postmaster parent is slow to waitpid()
-# - postmaster child is slow to exit in response to SIGQUIT
-# - postmaster child is slow to exit after postmaster death
-sub poll_start
-{
-	my ($node) = @_;
-
-	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
-	my $attempts = 0;
-
-	while ($attempts < $max_attempts)
-	{
-		$node->start(fail_ok => 1) && return 1;
-
-		# Wait 0.1 second before retrying.
-		usleep(100_000);
-
-		# Clean up in case the start attempt just timed out or some such.
-		$node->stop('fast', fail_ok => 1);
-
-		$attempts++;
-	}
-
-	# Try one last time without fail_ok, which will BAIL_OUT unless it
-	# succeeds.
-	$node->start && return 1;
-	return 0;
-}
-
 done_testing();
-- 
2.47.3

Reply via email to