Hi,

On 2023-01-06 11:52:04 +0530, vignesh C wrote:
> On Sat, 29 Oct 2022 at 08:24, Andres Freund <and...@anarazel.de> wrote:
> >
> > The patches here aren't fully polished (as will be evident). But they should
> > be more than good enough to discuss whether this is a sane direction.
> 
> The patch does not apply on top of HEAD as in [1], please post a rebased
> patch.

Thanks for letting me know. Updated version attached.

Greetings,

Andres Freund
From dc67e1ff43e550a2ff6a0181995f2f12bbb2a423 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 1 Jul 2020 19:06:45 -0700
Subject: [PATCH v2 01/14] aio: Add some error checking around pinning.

---
 src/include/storage/bufmgr.h        |  1 +
 src/backend/storage/buffer/bufmgr.c | 42 ++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 33eadbc1291..3becf32a3c0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -129,6 +129,7 @@ extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
 extern void IncrBufferRefCount(Buffer buffer);
+extern void BufferCheckOneLocalPin(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3fb38a25cfa..bfaf141edd7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1707,6 +1707,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 	bool		result;
 	PrivateRefCountEntry *ref;
 
+	Assert(!BufferIsLocal(b));
+
 	ref = GetPrivateRefCountEntry(b, true);
 
 	if (ref == NULL)
@@ -1852,6 +1854,8 @@ UnpinBuffer(BufferDesc *buf)
 	PrivateRefCountEntry *ref;
 	Buffer		b = BufferDescriptorGetBuffer(buf);
 
+	Assert(!BufferIsLocal(b));
+
 	/* not moving as we're likely deleting it soon anyway */
 	ref = GetPrivateRefCountEntry(b, false);
 	Assert(ref != NULL);
@@ -4209,6 +4213,25 @@ ConditionalLockBuffer(Buffer buffer)
 									LW_EXCLUSIVE);
 }
 
+void
+BufferCheckOneLocalPin(Buffer buffer)
+{
+	if (BufferIsLocal(buffer))
+	{
+		/* There should be exactly one pin */
+		if (LocalRefCount[-buffer - 1] != 1)
+			elog(ERROR, "incorrect local pin count: %d",
+				 LocalRefCount[-buffer - 1]);
+	}
+	else
+	{
+		/* There should be exactly one local pin */
+		if (GetPrivateRefCount(buffer) != 1)
+			elog(ERROR, "incorrect local pin count: %d",
+				 GetPrivateRefCount(buffer));
+	}
+}
+
 /*
  * LockBufferForCleanup - lock a buffer in preparation for deleting items
  *
@@ -4236,20 +4259,11 @@ LockBufferForCleanup(Buffer buffer)
 	Assert(BufferIsPinned(buffer));
 	Assert(PinCountWaitBuf == NULL);
 
-	if (BufferIsLocal(buffer))
-	{
-		/* There should be exactly one pin */
-		if (LocalRefCount[-buffer - 1] != 1)
-			elog(ERROR, "incorrect local pin count: %d",
-				 LocalRefCount[-buffer - 1]);
-		/* Nobody else to wait for */
-		return;
-	}
+	BufferCheckOneLocalPin(buffer);
 
-	/* There should be exactly one local pin */
-	if (GetPrivateRefCount(buffer) != 1)
-		elog(ERROR, "incorrect local pin count: %d",
-			 GetPrivateRefCount(buffer));
+	/* Nobody else to wait for */
+	if (BufferIsLocal(buffer))
+		return;
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 
@@ -4757,6 +4771,8 @@ LockBufHdr(BufferDesc *desc)
 	SpinDelayStatus delayStatus;
 	uint32		old_buf_state;
 
+	Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
+
 	init_local_spin_delay(&delayStatus);
 
 	while (true)
-- 
2.38.0

From bb6a65580687d8bb932e2dc26c32e72025d34354 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 24 Oct 2022 12:28:06 -0700
Subject: [PATCH v2 02/14] hio: Release extension lock before initializing page
 / pinning VM

Calling PageInit() while holding the extension lock has been unnecessary since
0d1fe9f74e3 started to use RBM_ZERO_AND_LOCK - nobody can look at the new page
before we release the page lock. PageInit() zeroes the page, which isn't that
cheap, so deferring it until after the extension lock has been released seems
like a good idea.

Doing visibilitymap_pin() while holding the extension lock, introduced in
7db0cd2145f2, looks like an accident. Due to the restrictions on
HEAP_INSERT_FROZEN it's unlikely to be a performance issue, but it still seems
better to move it out.
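
In short, the resulting order of operations looks roughly like this (condensed
from the diff below; the LockRelationForExtension() call and the
HEAP_INSERT_FROZEN check live in surrounding hio.c code the hunks don't show):

	if (needLock)
		LockRelationForExtension(relation, ExclusiveLock);

	/* Returns the new page zeroed and exclusively locked. */
	buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);

	/*
	 * OK for others to extend the relation further; nobody can look at the
	 * new page until we release the page lock we already hold.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	page = BufferGetPage(buffer);
	PageInit(page, BufferGetPageSize(buffer), 0);

	if (options & HEAP_INSERT_FROZEN)
		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);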
---
 src/backend/access/heap/hio.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index e152807d2dc..7479212d4e0 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -623,6 +623,13 @@ loop:
 	 */
 	buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
 
+	/*
+	 * Release the file-extension lock; it's now OK for someone else to extend
+	 * the relation some more.
+	 */
+	if (needLock)
+		UnlockRelationForExtension(relation, ExclusiveLock);
+
 	/*
 	 * We need to initialize the empty new page.  Double-check that it really
 	 * is empty (this should never happen, but if it does we don't want to
@@ -647,13 +654,6 @@ loop:
 		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
 	}
 
-	/*
-	 * Release the file-extension lock; it's now OK for someone else to extend
-	 * the relation some more.
-	 */
-	if (needLock)
-		UnlockRelationForExtension(relation, ExclusiveLock);
-
 	/*
 	 * Lock the other buffer. It's guaranteed to be of a lower page number
 	 * than the new page. To conform with the deadlock prevent rules, we ought
-- 
2.38.0

From 097a56759a7d4ac8352d95b4f23ec98e96bb394f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 23 Oct 2022 14:25:46 -0700
Subject: [PATCH v2 03/14] Add smgrzeroextend(), FileZero(), FileFallocate()

smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple
blocks. When extending by only a small number of blocks, it uses FileZero()
instead, as posix_fallocate() is inefficient for small extensions on some file
systems / operating systems. FileZero() is also used as the fallback for
FileFallocate() on platforms / filesystems that don't support fallocate.
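
As a purely illustrative usage sketch - the actual call sites aren't part of
this patch - a caller extending a relation fork by a batch of zeroed blocks
would do something like:

	/*
	 * Hypothetical example: extend the main fork of "reln" by 64 zero-filled
	 * blocks, starting at the current end of the fork. With skipFsync =
	 * false the touched segments get registered for syncing at checkpoint.
	 */
	BlockNumber	first_block = smgrnblocks(reln, MAIN_FORKNUM);

	smgrzeroextend(reln, MAIN_FORKNUM, first_block, 64, false);

	Assert(smgrnblocks(reln, MAIN_FORKNUM) == first_block + 64);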

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/fd.h        |   3 +
 src/include/storage/md.h        |   2 +
 src/include/storage/smgr.h      |   2 +
 src/backend/storage/file/fd.c   | 105 ++++++++++++++++++++++++++++++++
 src/backend/storage/smgr/md.c   | 103 +++++++++++++++++++++++++++++++
 src/backend/storage/smgr/smgr.c |  21 +++++++
 6 files changed, 236 insertions(+)

diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index f85de97d083..2c9453aa3f0 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -106,6 +106,9 @@ extern int	FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event
 extern int	FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
 extern int	FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info);
 extern int	FileSync(File file, uint32 wait_event_info);
+extern int	FileZero(File file, off_t offset, off_t len, uint32 wait_event_info);
+extern int	FileFallocate(File file, off_t offset, off_t len, uint32 wait_event_info);
+
 extern off_t FileSize(File file);
 extern int	FileTruncate(File file, off_t offset, uint32 wait_event_info);
 extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index bcada9ff221..67afd14d7b0 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -28,6 +28,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
 extern void mdextend(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 56233c4d216..a5806029ce1 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -92,6 +92,8 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 926d000f2ea..afd05e48cc0 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -93,6 +93,7 @@
 #include "common/pg_prng.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_iovec.h"
 #include "portability/mem.h"
 #include "postmaster/startup.h"
 #include "storage/fd.h"
@@ -2205,6 +2206,105 @@ FileSync(File file, uint32 wait_event_info)
 	return returnCode;
 }
 
+/* So that FileZero() doesn't have to re-zero a block on every call */
+static const PGAlignedBlock zerobuf = {0};
+
+int
+FileZero(File file, off_t offset, off_t len, uint32 wait_event_info)
+{
+	int			returnCode;
+	int			numblocks;
+	struct iovec iov[PG_IOV_MAX];
+
+	/*
+	 * FIXME: Quick-and-dirty implementation, to be replaced by
+	 * pg_pwrite_zeros() from
+	 * https://postgr.es/m/Y1oc%2BFjiyVjNZa%2BL%40paquier.xyz
+	 *
+	 * Otherwise it would not at all be OK to rely on len being a multiple of
+	 * BLCKSZ.
+	 */
+	Assert((len % BLCKSZ) == 0);
+
+	Assert(FileIsValid(file));
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	numblocks = len / BLCKSZ;
+
+	for (int i = 0; i < Min(numblocks, lengthof(iov)); ++i)
+	{
+		iov[i].iov_base = (char *) zerobuf.data;
+		iov[i].iov_len = BLCKSZ;
+	}
+
+	while (numblocks > 0)
+	{
+		int			iovcnt = Min(numblocks, lengthof(iov));
+		off_t		seekpos_l = offset;
+		ssize_t		ret;
+
+		pgstat_report_wait_start(wait_event_info);
+		ret = pg_pwritev_with_retry(VfdCache[file].fd, iov, iovcnt, seekpos_l);
+		pgstat_report_wait_end();
+
+		if (ret < 0)
+			return -1;
+
+		Assert(ret == iovcnt * BLCKSZ);
+		offset += iovcnt * BLCKSZ;
+		numblocks -= iovcnt;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc implements posix_fallocate() in userspace if it is
+ * not supported by the filesystem. That's not the case for all environments,
+ * though.
+ */
+int
+FileFallocate(File file, off_t offset, off_t len, uint32 wait_event_info)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+#ifdef HAVE_POSIX_FALLOCATE
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = posix_fallocate(VfdCache[file].fd, offset, len);
+	pgstat_report_wait_end();
+
+	if (returnCode == 0)
+		return 0;
+
+	/* for compatibility with %m printing etc */
+	errno = returnCode;
+
+	/*
+	 * Return in case of a "real" failure; if fallocate is not supported,
+	 * fall through to the FileZero()-backed implementation.
+	 */
+	if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+		return returnCode;
+
+#endif
+
+	return FileZero(file, offset, len, wait_event_info);
+}
+
 off_t
 FileSize(File file)
 {
@@ -2277,6 +2377,11 @@ int
 FileGetRawDesc(File file)
 {
 	Assert(FileIsValid(file));
+
+	if (FileAccess(file) < 0)
+		return -1;
+
 	return VfdCache[file].fd;
 }
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 60c9905eff9..2197670f4b0 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -28,6 +28,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
+#include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
@@ -500,6 +501,108 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 }
 
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+	MdfdVec    *v;
+	BlockNumber curblocknum = blocknum;
+	int         remblocks = nblocks;
+
+	Assert(nblocks > 0);
+
+	/* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+	Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber or larger.
+	 */
+	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forknum),
+						InvalidBlockNumber)));
+
+	while (remblocks > 0)
+	{
+		int			segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+		int			segendblock = (curblocknum % ((BlockNumber) RELSEG_SIZE)) + remblocks;
+		off_t       seekpos = (off_t) BLCKSZ * segstartblock;
+		int			numblocks;
+
+		if (segendblock > RELSEG_SIZE)
+			segendblock = RELSEG_SIZE;
+
+		numblocks = segendblock - segstartblock;
+
+		v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+		Assert(segstartblock < RELSEG_SIZE);
+		Assert(segendblock <= RELSEG_SIZE);
+
+		/*
+		 * If available use posix_fallocate() to extend the relation. That's
+		 * often more efficient than using write(), as it commonly won't cause
+		 * the kernel to allocate page cache space for the extended pages.
+		 *
+		 * However, we shouldn't use fallocate() for small extensions, as it
+		 * defeats delayed allocation on some filesystems. It's not clear
+		 * where that decision should be made, though. For now just use a
+		 * cutoff of 8; anything between 4 and 8 worked OK in some local
+		 * testing.
+		 */
+		if (numblocks > 8)
+		{
+			int         ret;
+
+			ret = FileFallocate(v->mdfd_vfd, seekpos,
+								(off_t) BLCKSZ * numblocks,
+								WAIT_EVENT_DATA_FILE_EXTEND);
+			if (ret != 0)
+			{
+				ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not extend file \"%s\" with posix_fallocate(): %m",
+									FilePathName(v->mdfd_vfd)),
+							 errhint("Check free disk space.")));
+			}
+		}
+		else
+		{
+			int         ret;
+
+			/*
+			 * Even if we don't have fallocate, we can still extend a bit more
+			 * FileZero() uses pg_pwritev_with_retry() with a single zeroed
+			 * FileZero() uses pg_writev[with_retry] with a single zeroed
+			 * buffer to avoid needing a zeroed buffer for the whole length of
+			 * the extension.
+			 */
+			ret = FileZero(v->mdfd_vfd, seekpos,
+						   (off_t) BLCKSZ * numblocks,
+						   WAIT_EVENT_DATA_FILE_EXTEND);
+			if (ret < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not extend file \"%s\": %m",
+								FilePathName(v->mdfd_vfd)),
+						 errhint("Check free disk space.")));
+		}
+
+		if (!skipFsync && !SmgrIsTemp(reln))
+			register_dirty_segment(reln, forknum, v);
+
+		Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+		remblocks -= segendblock - segstartblock;
+		curblocknum += segendblock - segstartblock;
+	}
+}
+
 /*
  *	mdopenfork() -- Open one fork of the specified relation.
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 80eb6311e74..2c0d26eabe0 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -50,6 +50,8 @@ typedef struct f_smgr
 								bool isRedo);
 	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
 								BlockNumber blocknum, char *buffer, bool skipFsync);
+	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
@@ -75,6 +77,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_exists = mdexists,
 		.smgr_unlink = mdunlink,
 		.smgr_extend = mdextend,
+		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
 		.smgr_read = mdread,
 		.smgr_write = mdwrite,
@@ -507,6 +510,24 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 }
 
+void
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			   int nblocks, bool skipFsync)
+{
+	smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+											 nblocks, skipFsync);
+
+	/*
+	 * Normally we expect this to increase the size of the fork by nblocks,
+	 * but if the cached value isn't as expected, just invalidate it so the
+	 * next call asks the kernel.
+	 */
+	if (reln->smgr_cached_nblocks[forknum] == blocknum)
+		reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+	else
+		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
+
 /*
  *	smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
  *
-- 
2.38.0

From a9189ec0e28855b0e3bf02f1dfbe820227910b4d Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 26 Oct 2022 12:05:07 -0700
Subject: [PATCH v2 04/14] bufmgr: Add Pin/UnpinLocalBuffer()

So far these were open-coded in quite a few places, without a good reason.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/buf_internals.h   |  2 +
 src/backend/storage/buffer/bufmgr.c   | 30 +++----------
 src/backend/storage/buffer/localbuf.c | 62 +++++++++++++++++----------
 3 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index ed8aa2519c0..4b1aeb5fd25 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -413,6 +413,8 @@ extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
 
 /* localbuf.c */
+extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
+extern void UnpinLocalBuffer(Buffer buffer);
 extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
 												ForkNumber forkNum,
 												BlockNumber blockNum);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index bfaf141edd7..678e390ab4d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -644,20 +644,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 		/* Is it still valid and holding the right tag? */
 		if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
 		{
-			/*
-			 * Bump buffer's ref and usage counts. This is equivalent of
-			 * PinBuffer for a shared buffer.
-			 */
-			if (LocalRefCount[b] == 0)
-			{
-				if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
-				{
-					buf_state += BUF_USAGECOUNT_ONE;
-					pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-				}
-			}
-			LocalRefCount[b]++;
-			ResourceOwnerRememberBuffer(CurrentResourceOwner, recent_buffer);
+			PinLocalBuffer(bufHdr, true);
 
 			pgBufferUsage.local_blks_hit++;
 
@@ -1660,8 +1647,7 @@ ReleaseAndReadBuffer(Buffer buffer,
 				BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
 				BufTagGetForkNum(&bufHdr->tag) == forkNum)
 				return buffer;
-			ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
-			LocalRefCount[-buffer - 1]--;
+			UnpinLocalBuffer(buffer);
 		}
 		else
 		{
@@ -3938,15 +3924,9 @@ ReleaseBuffer(Buffer buffer)
 		elog(ERROR, "bad buffer ID: %d", buffer);
 
 	if (BufferIsLocal(buffer))
-	{
-		ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
-
-		Assert(LocalRefCount[-buffer - 1] > 0);
-		LocalRefCount[-buffer - 1]--;
-		return;
-	}
-
-	UnpinBuffer(GetBufferDescriptor(buffer - 1));
+		UnpinLocalBuffer(buffer);
+	else
+		UnpinBuffer(GetBufferDescriptor(buffer - 1));
 }
 
 /*
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index b2720df6eaa..7b6294deef3 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -136,27 +136,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
 				smgr->smgr_rlocator.locator.relNumber, forkNum, blockNum, -b - 1);
 #endif
-		buf_state = pg_atomic_read_u32(&bufHdr->state);
 
-		/* this part is equivalent to PinBuffer for a shared buffer */
-		if (LocalRefCount[b] == 0)
-		{
-			if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
-			{
-				buf_state += BUF_USAGECOUNT_ONE;
-				pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-			}
-		}
-		LocalRefCount[b]++;
-		ResourceOwnerRememberBuffer(CurrentResourceOwner,
-									BufferDescriptorGetBuffer(bufHdr));
-		if (buf_state & BM_VALID)
-			*foundPtr = true;
-		else
-		{
-			/* Previous read attempt must have failed; try again */
-			*foundPtr = false;
-		}
+		*foundPtr = PinLocalBuffer(bufHdr, true);
 		return bufHdr;
 	}
 
@@ -193,9 +174,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			else
 			{
 				/* Found a usable buffer */
-				LocalRefCount[b]++;
-				ResourceOwnerRememberBuffer(CurrentResourceOwner,
-											BufferDescriptorGetBuffer(bufHdr));
+				PinLocalBuffer(bufHdr, false);
 				break;
 			}
 		}
@@ -483,6 +462,43 @@ InitLocalBuffers(void)
 	NLocBuffer = nbufs;
 }
 
+bool
+PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
+{
+	uint32		buf_state;
+	Buffer		buffer = BufferDescriptorGetBuffer(buf_hdr);
+	int			bufid = -(buffer + 1);
+
+	buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+	if (LocalRefCount[bufid] == 0)
+	{
+		if (adjust_usagecount &&
+			BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+		{
+			buf_state += BUF_USAGECOUNT_ONE;
+			pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+		}
+	}
+	LocalRefCount[bufid]++;
+	ResourceOwnerRememberBuffer(CurrentResourceOwner,
+								BufferDescriptorGetBuffer(buf_hdr));
+
+	return buf_state & BM_VALID;
+}
+
+void
+UnpinLocalBuffer(Buffer buffer)
+{
+	int buffid = -buffer - 1;
+
+	Assert(BufferIsLocal(buffer));
+	Assert(LocalRefCount[buffid] > 0);
+
+	ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
+	LocalRefCount[buffid]--;
+}
+
 /*
  * GUC check_hook for temp_buffers
  */
-- 
2.38.0

From e0fb77e2a162385bea4a9a6ce7e31935d76fd161 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 23 Oct 2022 14:29:57 -0700
Subject: [PATCH v2 05/14] bufmgr: Acquire and clean victim buffer separately

Previously we held locks on two buffer mapping partitions at the same time to
change the identity of a buffer. Particularly when extending relations, having
to hold the extension lock while acquiring a victim buffer is painful. By
separating out the victim buffer acquisition, future commits will be able to
change relation extension to scale better.
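
Condensed, the new BufferAlloc() flow is roughly the following (collision
handling and error paths elided, see the diff below):

	/* 1. Look up the tag under a shared lock on the new mapping partition. */

	/* 2. Not found: acquire a clean victim buffer, holding no mapping lock. */
	victim_buffer = GetVictimBuffer(strategy);
	victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);

	/* 3. Insert the new tag while holding only the new partition's lock. */
	LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
	existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);

	if (existing_buf_id >= 0)
	{
		/* Lost the race: unpin the victim and use the existing buffer. */
		UnpinBuffer(victim_buf_hdr);
	}
	else
	{
		/* 4. Set the victim's tag and flags under its header spinlock. */
		victim_buf_hdr->tag = newTag;
	}

	LWLockRelease(newPartitionLock);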

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/buffer/bufmgr.c   | 570 ++++++++++++++------------
 src/backend/storage/buffer/localbuf.c | 115 +++---
 2 files changed, 381 insertions(+), 304 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 678e390ab4d..b9af8a05989 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -482,6 +482,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
 							   BlockNumber blockNum,
 							   BufferAccessStrategy strategy,
 							   bool *foundPtr);
+static Buffer GetVictimBuffer(BufferAccessStrategy strategy);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
 									   ForkNumber forkNum,
@@ -1111,14 +1112,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferTag	newTag;			/* identity of requested block */
 	uint32		newHash;		/* hash value for newTag */
 	LWLock	   *newPartitionLock;	/* buffer partition lock for it */
-	BufferTag	oldTag;			/* previous identity of selected buffer */
-	uint32		oldHash;		/* hash value for oldTag */
-	LWLock	   *oldPartitionLock;	/* buffer partition lock for it */
-	uint32		oldFlags;
-	int			buf_id;
-	BufferDesc *buf;
-	bool		valid;
-	uint32		buf_state;
+	int			existing_buf_id;
+
+	Buffer		victim_buffer;
+	BufferDesc *victim_buf_hdr;
+	uint32		victim_buf_state;
 
 	/* create a tag so we can lookup the buffer */
 	InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
@@ -1129,15 +1127,18 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	/* see if the block is in the buffer pool already */
 	LWLockAcquire(newPartitionLock, LW_SHARED);
-	buf_id = BufTableLookup(&newTag, newHash);
-	if (buf_id >= 0)
+	existing_buf_id = BufTableLookup(&newTag, newHash);
+	if (existing_buf_id >= 0)
 	{
+		BufferDesc *buf;
+		bool		valid;
+
 		/*
 		 * Found it.  Now, pin the buffer so no one can steal it from the
 		 * buffer pool, and check to see if the correct data has been loaded
 		 * into the buffer.
 		 */
-		buf = GetBufferDescriptor(buf_id);
+		buf = GetBufferDescriptor(existing_buf_id);
 
 		valid = PinBuffer(buf, strategy);
 
@@ -1174,266 +1175,96 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 */
 	LWLockRelease(newPartitionLock);
 
-	/* Loop here in case we have to try another victim buffer */
-	for (;;)
+	/*
+	 * Acquire a victim buffer. Somebody else might try to do the same, as we
+	 * don't hold any conflicting locks. If so, we'll have to undo our work
+	 * later.
+	 */
+	victim_buffer = GetVictimBuffer(strategy);
+	victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
+
+	/*
+	 * Try to make a hashtable entry for the buffer under its new tag. If
+	 * somebody else inserted another buffer for the tag, we'll release the
+	 * victim buffer we acquired and use the already inserted one.
+	 */
+	LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+	existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
+	if (existing_buf_id >= 0)
 	{
-		/*
-		 * Ensure, while the spinlock's not yet held, that there's a free
-		 * refcount entry.
-		 */
-		ReservePrivateRefCountEntry();
+		BufferDesc *existing_buf_hdr;
+		bool		valid;
 
 		/*
-		 * Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!
+		 * Got a collision. Someone has already done what we were about to
+		 * do. We'll just handle this as if it were found in the buffer pool
+		 * in the first place.  First, give up the buffer we were planning to
+		 * use.
+		 *
+		 * We could do this after releasing the partition lock, but then we'd
+		 * have to call ResourceOwnerEnlargeBuffers() &
+		 * ReservePrivateRefCountEntry() before acquiring the lock, for the
+		 * rare case of such a collision.
 		 */
-		buf = StrategyGetBuffer(strategy, &buf_state);
+		UnpinBuffer(victim_buf_hdr);
 
-		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+		/* FIXME: Should we put the victim buffer onto the freelist? */
 
-		/* Must copy buffer flags while we still hold the spinlock */
-		oldFlags = buf_state & BUF_FLAG_MASK;
+		/* remaining code should match code at top of routine */
 
-		/* Pin the buffer and then release the buffer spinlock */
-		PinBuffer_Locked(buf);
+		existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
 
-		/*
-		 * If the buffer was dirty, try to write it out.  There is a race
-		 * condition here, in that someone might dirty it after we released it
-		 * above, or even while we are writing it out (since our share-lock
-		 * won't prevent hint-bit updates).  We will recheck the dirty bit
-		 * after re-locking the buffer header.
-		 */
-		if (oldFlags & BM_DIRTY)
-		{
-			/*
-			 * We need a share-lock on the buffer contents to write it out
-			 * (else we might write invalid data, eg because someone else is
-			 * compacting the page contents while we write).  We must use a
-			 * conditional lock acquisition here to avoid deadlock.  Even
-			 * though the buffer was not pinned (and therefore surely not
-			 * locked) when StrategyGetBuffer returned it, someone else could
-			 * have pinned and exclusive-locked it by the time we get here. If
-			 * we try to get the lock unconditionally, we'd block waiting for
-			 * them; if they later block waiting for us, deadlock ensues.
-			 * (This has been observed to happen when two backends are both
-			 * trying to split btree index pages, and the second one just
-			 * happens to be trying to split the page the first one got from
-			 * StrategyGetBuffer.)
-			 */
-			if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
-										 LW_SHARED))
-			{
-				/*
-				 * If using a nondefault strategy, and writing the buffer
-				 * would require a WAL flush, let the strategy decide whether
-				 * to go ahead and write/reuse the buffer or to choose another
-				 * victim.  We need lock to inspect the page LSN, so this
-				 * can't be done inside StrategyGetBuffer.
-				 */
-				if (strategy != NULL)
-				{
-					XLogRecPtr	lsn;
+		valid = PinBuffer(existing_buf_hdr, strategy);
 
-					/* Read the LSN while holding buffer header lock */
-					buf_state = LockBufHdr(buf);
-					lsn = BufferGetLSN(buf);
-					UnlockBufHdr(buf, buf_state);
-
-					if (XLogNeedsFlush(lsn) &&
-						StrategyRejectBuffer(strategy, buf))
-					{
-						/* Drop lock/pin and loop around for another buffer */
-						LWLockRelease(BufferDescriptorGetContentLock(buf));
-						UnpinBuffer(buf);
-						continue;
-					}
-				}
-
-				/* OK, do the I/O */
-				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
-														  smgr->smgr_rlocator.locator.spcOid,
-														  smgr->smgr_rlocator.locator.dbOid,
-														  smgr->smgr_rlocator.locator.relNumber);
-
-				FlushBuffer(buf, NULL);
-				LWLockRelease(BufferDescriptorGetContentLock(buf));
-
-				ScheduleBufferTagForWriteback(&BackendWritebackContext,
-											  &buf->tag);
-
-				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
-														 smgr->smgr_rlocator.locator.spcOid,
-														 smgr->smgr_rlocator.locator.dbOid,
-														 smgr->smgr_rlocator.locator.relNumber);
-			}
-			else
-			{
-				/*
-				 * Someone else has locked the buffer, so give it up and loop
-				 * back to get another one.
-				 */
-				UnpinBuffer(buf);
-				continue;
-			}
-		}
-
-		/*
-		 * To change the association of a valid buffer, we'll need to have
-		 * exclusive lock on both the old and new mapping partitions.
-		 */
-		if (oldFlags & BM_TAG_VALID)
-		{
-			/*
-			 * Need to compute the old tag's hashcode and partition lock ID.
-			 * XXX is it worth storing the hashcode in BufferDesc so we need
-			 * not recompute it here?  Probably not.
-			 */
-			oldTag = buf->tag;
-			oldHash = BufTableHashCode(&oldTag);
-			oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-			/*
-			 * Must lock the lower-numbered partition first to avoid
-			 * deadlocks.
-			 */
-			if (oldPartitionLock < newPartitionLock)
-			{
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-			else if (oldPartitionLock > newPartitionLock)
-			{
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-				LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-			}
-			else
-			{
-				/* only one partition, only one lock */
-				LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			}
-		}
-		else
-		{
-			/* if it wasn't valid, we need only the new partition */
-			LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
-			/* remember we have no old-partition lock or tag */
-			oldPartitionLock = NULL;
-			/* keep the compiler quiet about uninitialized variables */
-			oldHash = 0;
-		}
-
-		/*
-		 * Try to make a hashtable entry for the buffer under its new tag.
-		 * This could fail because while we were writing someone else
-		 * allocated another buffer for the same block we want to read in.
-		 * Note that we have not yet removed the hashtable entry for the old
-		 * tag.
-		 */
-		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
-
-		if (buf_id >= 0)
-		{
-			/*
-			 * Got a collision. Someone has already done what we were about to
-			 * do. We'll just handle this as if it were found in the buffer
-			 * pool in the first place.  First, give up the buffer we were
-			 * planning to use.
-			 */
-			UnpinBuffer(buf);
-
-			/* Can give up that buffer's mapping partition lock now */
-			if (oldPartitionLock != NULL &&
-				oldPartitionLock != newPartitionLock)
-				LWLockRelease(oldPartitionLock);
-
-			/* remaining code should match code at top of routine */
-
-			buf = GetBufferDescriptor(buf_id);
-
-			valid = PinBuffer(buf, strategy);
-
-			/* Can release the mapping lock as soon as we've pinned it */
-			LWLockRelease(newPartitionLock);
-
-			*foundPtr = true;
-
-			if (!valid)
-			{
-				/*
-				 * We can only get here if (a) someone else is still reading
-				 * in the page, or (b) a previous read attempt failed.  We
-				 * have to wait for any active read attempt to finish, and
-				 * then set up our own read attempt if the page is still not
-				 * BM_VALID.  StartBufferIO does it all.
-				 */
-				if (StartBufferIO(buf, true))
-				{
-					/*
-					 * If we get here, previous attempts to read the buffer
-					 * must have failed ... but we shall bravely try again.
-					 */
-					*foundPtr = false;
-				}
-			}
-
-			return buf;
-		}
-
-		/*
-		 * Need to lock the buffer header too in order to change its tag.
-		 */
-		buf_state = LockBufHdr(buf);
-
-		/*
-		 * Somebody could have pinned or re-dirtied the buffer while we were
-		 * doing the I/O and making the new hashtable entry.  If so, we can't
-		 * recycle this buffer; we must undo everything we've done and start
-		 * over with a new victim buffer.
-		 */
-		oldFlags = buf_state & BUF_FLAG_MASK;
-		if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
-			break;
-
-		UnlockBufHdr(buf, buf_state);
-		BufTableDelete(&newTag, newHash);
-		if (oldPartitionLock != NULL &&
-			oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
+		/* Can release the mapping lock as soon as we've pinned it */
 		LWLockRelease(newPartitionLock);
-		UnpinBuffer(buf);
+
+		*foundPtr = true;
+
+		if (!valid)
+		{
+			/*
+			 * We can only get here if (a) someone else is still reading
+			 * in the page, or (b) a previous read attempt failed.  We
+			 * have to wait for any active read attempt to finish, and
+			 * then set up our own read attempt if the page is still not
+			 * BM_VALID.  StartBufferIO does it all.
+			 */
+			if (StartBufferIO(existing_buf_hdr, true))
+			{
+				/*
+				 * If we get here, previous attempts to read the buffer
+				 * must have failed ... but we shall bravely try again.
+				 */
+				*foundPtr = false;
+			}
+		}
+
+		return existing_buf_hdr;
 	}
 
 	/*
-	 * Okay, it's finally safe to rename the buffer.
-	 *
-	 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
-	 * paranoia.  We also reset the usage_count since any recency of use of
-	 * the old content is no longer relevant.  (The usage_count starts out at
-	 * 1 so that the buffer can survive one clock-sweep pass.)
-	 *
+	 * Need to lock the buffer header too in order to change its tag.
+	 */
+	victim_buf_state = LockBufHdr(victim_buf_hdr);
+
+	/* some sanity checks while we hold the buffer header lock */
+	Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
+	Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
+
+	victim_buf_hdr->tag = newTag;
+
+	/*
 	 * Make sure BM_PERMANENT is set for buffers that must be written at every
 	 * checkpoint.  Unlogged buffers only need to be written at shutdown
 	 * checkpoints, except for their "init" forks, which need to be treated
 	 * just like permanent relations.
 	 */
-	buf->tag = newTag;
-	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
-				   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
-				   BUF_USAGECOUNT_MASK);
+	victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 	if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
-		buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
-	else
-		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+		victim_buf_state |= BM_PERMANENT;
 
-	UnlockBufHdr(buf, buf_state);
-
-	if (oldPartitionLock != NULL)
-	{
-		BufTableDelete(&oldTag, oldHash);
-		if (oldPartitionLock != newPartitionLock)
-			LWLockRelease(oldPartitionLock);
-	}
+	UnlockBufHdr(victim_buf_hdr, victim_buf_state);
 
 	LWLockRelease(newPartitionLock);
 
@@ -1443,12 +1274,12 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * to read it before we did, so there's nothing left for BufferAlloc() to
 	 * do.
 	 */
-	if (StartBufferIO(buf, true))
+	if (StartBufferIO(victim_buf_hdr, true))
 		*foundPtr = false;
 	else
 		*foundPtr = true;
 
-	return buf;
+	return victim_buf_hdr;
 }
 
 /*
@@ -1557,6 +1388,239 @@ retry:
 	StrategyFreeBuffer(buf);
 }
 
+/*
+ * Helper routine for GetVictimBuffer()
+ *
+ * Needs to be called with the buffer pinned, but without the buffer header
+ * spinlock held.
+ *
+ * Returns true if the buffer can be reused, in which case the buffer is only
+ * pinned by this backend and marked as invalid, false otherwise.
+ */
+static bool
+InvalidateVictimBuffer(BufferDesc *buf_hdr)
+{
+	uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+	Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
+
+	/* can't change while we're holding the pin */
+	if (buf_state & BM_TAG_VALID)
+	{
+		uint32      hash;
+		LWLock     *partition_lock;
+		BufferTag   tag;
+
+		/* have buffer pinned, so it's safe to read tag without lock */
+		tag = buf_hdr->tag;
+
+		hash = BufTableHashCode(&tag);
+		partition_lock = BufMappingPartitionLock(hash);
+
+		LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+		/* lock the buffer header */
+		buf_state = LockBufHdr(buf_hdr);
+
+		Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
+
+		/*
+		 * We have the buffer pinned, so nobody else should have been able to
+		 * unset this concurrently.
+		 */
+		Assert(buf_state & BM_TAG_VALID);
+		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+		/*
+		 * If somebody else has pinned the buffer since (or, even worse,
+		 * dirtied it), give up on this buffer: it's clearly in use.
+		 */
+		if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
+		{
+			Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+			UnlockBufHdr(buf_hdr, buf_state);
+			LWLockRelease(partition_lock);
+
+			return false;
+		}
+
+		/*
+		 * Clear out the buffer's tag and flags and usagecount.  We must do
+		 * this to ensure that linear scans of the buffer array don't think
+		 * the buffer is valid.
+		 *
+		 * XXX: This is a pre-existing comment I just moved, but isn't it
+		 * entirely bogus with regard to the tag? We can't do anything with
+		 * the buffer without taking BM_VALID / BM_TAG_VALID into
+		 * account. Likely doesn't matter because we're already dirtying the
+		 * cacheline, but still.
+		 */
+		ClearBufferTag(&buf_hdr->tag);
+		buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+		UnlockBufHdr(buf_hdr, buf_state);
+
+		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+		BufTableDelete(&tag, hash);
+
+		LWLockRelease(partition_lock);
+	}
+
+	Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
+	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+	Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
+
+	return true;
+}
+
+static Buffer
+GetVictimBuffer(BufferAccessStrategy strategy)
+{
+	Buffer cur_buf;
+	BufferDesc *cur_buf_hdr = NULL;
+	uint32 cur_buf_state;
+
+	/*
+	 * Ensure, while the spinlock's not yet held, that there's a free
+	 * refcount entry.
+	 */
+	ReservePrivateRefCountEntry();
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+again:
+
+	/*
+	 * Select a victim buffer.  The buffer is returned with its header
+	 * spinlock still held!
+	 */
+	cur_buf_hdr = StrategyGetBuffer(strategy, &cur_buf_state);
+	cur_buf = BufferDescriptorGetBuffer(cur_buf_hdr);
+
+	Assert(BUF_STATE_GET_REFCOUNT(cur_buf_state) == 0);
+
+	/* Pin the buffer and then release the buffer spinlock */
+	PinBuffer_Locked(cur_buf_hdr);
+
+	/*
+	 * We shouldn't have any other pins for this buffer.
+	 */
+	BufferCheckOneLocalPin(cur_buf);
+
+	/*
+	 * If the buffer was dirty, try to write it out.  There is a race
+	 * condition here, in that someone might dirty it after we released the
+	 * buffer header lock above, or even while we are writing it out (since
+	 * our share-lock won't prevent hint-bit updates).  We will recheck the
+	 * dirty bit after re-locking the buffer header.
+	 */
+	if (cur_buf_state & BM_DIRTY)
+	{
+		LWLock *content_lock;
+
+		Assert(cur_buf_state & BM_TAG_VALID);
+		Assert(cur_buf_state & BM_VALID);
+
+		/*
+		 * We need a share-lock on the buffer contents to write it out
+		 * (else we might write invalid data, eg because someone else is
+		 * compacting the page contents while we write).  We must use a
+		 * conditional lock acquisition here to avoid deadlock.  Even
+		 * though the buffer was not pinned (and therefore surely not
+		 * locked) when StrategyGetBuffer returned it, someone else could
+		 * have pinned and exclusive-locked it by the time we get here. If
+		 * we try to get the lock unconditionally, we'd block waiting for
+		 * them; if they later block waiting for us, deadlock ensues.
+		 * (This has been observed to happen when two backends are both
+		 * trying to split btree index pages, and the second one just
+		 * happens to be trying to split the page the first one got from
+		 * StrategyGetBuffer.)
+		 */
+		content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+		if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+		{
+			/*
+			 * Someone else has locked the buffer, so give it up and loop
+			 * back to get another one.
+			 */
+			UnpinBuffer(cur_buf_hdr);
+			goto again;
+		}
+
+		/*
+		 * If using a nondefault strategy, and writing the buffer would
+		 * require a WAL flush, let the strategy decide whether to go ahead
+		 * and write/reuse the buffer or to choose another victim.  We need
+		 * lock to inspect the page LSN, so this can't be done inside
+		 * StrategyGetBuffer.
+		 */
+		if (strategy != NULL)
+		{
+			XLogRecPtr	lsn;
+
+			/* Read the LSN while holding buffer header lock */
+			cur_buf_state = LockBufHdr(cur_buf_hdr);
+			lsn = BufferGetLSN(cur_buf_hdr);
+			UnlockBufHdr(cur_buf_hdr, cur_buf_state);
+
+			if (XLogNeedsFlush(lsn)
+				&& StrategyRejectBuffer(strategy, cur_buf_hdr))
+			{
+				LWLockRelease(content_lock);
+				UnpinBuffer(cur_buf_hdr);
+				goto again;
+			}
+		}
+
+		/* OK, do the I/O */
+		/* FIXME: These used the wrong smgr before afaict? */
+		{
+			SMgrRelation smgr = smgropen(BufTagGetRelFileLocator(&cur_buf_hdr->tag),
+										 InvalidBackendId);
+
+			TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(cur_buf_hdr->tag.forkNum,
+													  cur_buf_hdr->tag.blockNum,
+													  smgr->smgr_rlocator.locator.spcOid,
+													  smgr->smgr_rlocator.locator.dbOid,
+													  smgr->smgr_rlocator.locator.relNumber);
+
+			FlushBuffer(cur_buf_hdr, smgr);
+			LWLockRelease(content_lock);
+
+			ScheduleBufferTagForWriteback(&BackendWritebackContext,
+										  &cur_buf_hdr->tag);
+
+			TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(cur_buf_hdr->tag.forkNum,
+													 cur_buf_hdr->tag.blockNum,
+													 smgr->smgr_rlocator.locator.spcOid,
+													 smgr->smgr_rlocator.locator.dbOid,
+													 smgr->smgr_rlocator.locator.relNumber);
+		}
+	}
+
+	/*
+	 * If the buffer has an entry in the buffer mapping table, delete it. This
+	 * can fail because another backend could have pinned or dirtied the
+	 * buffer.
+	 */
+	if (!InvalidateVictimBuffer(cur_buf_hdr))
+	{
+		UnpinBuffer(cur_buf_hdr);
+		goto again;
+	}
+
+	/* a final set of sanity checks */
+	cur_buf_state = pg_atomic_read_u32(&cur_buf_hdr->state);
+
+	Assert(BUF_STATE_GET_REFCOUNT(cur_buf_state) == 1);
+	Assert(!(cur_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
+
+	BufferCheckOneLocalPin(cur_buf);
+
+	return cur_buf;
+}
 /*
  * MarkBufferDirty
  *
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 7b6294deef3..b1d0c309918 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -44,13 +44,14 @@ BufferDesc *LocalBufferDescriptors = NULL;
 Block	   *LocalBufferBlockPointers = NULL;
 int32	   *LocalRefCount = NULL;
 
-static int	nextFreeLocalBuf = 0;
+static int	nextFreeLocalBufId = 0;
 
 static HTAB *LocalBufHash = NULL;
 
 
 static void InitLocalBuffers(void);
 static Block GetLocalBufferStorage(void);
+static Buffer GetLocalVictimBuffer(void);
 
 
 /*
@@ -112,10 +113,9 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	BufferTag	newTag;			/* identity of requested block */
 	LocalBufferLookupEnt *hresult;
 	BufferDesc *bufHdr;
-	int			b;
-	int			trycounter;
+	Buffer		victim_buffer;
+	int			bufid;
 	bool		found;
-	uint32		buf_state;
 
 	InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
 
@@ -129,23 +129,51 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 	if (hresult)
 	{
-		b = hresult->id;
-		bufHdr = GetLocalBufferDescriptor(b);
+		bufid = hresult->id;
+		bufHdr = GetLocalBufferDescriptor(bufid);
 		Assert(BufferTagsEqual(&bufHdr->tag, &newTag));
-#ifdef LBDEBUG
-		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
-				smgr->smgr_rlocator.locator.relNumber, forkNum, blockNum, -b - 1);
-#endif
 
 		*foundPtr = PinLocalBuffer(bufHdr, true);
-		return bufHdr;
+	}
+	else
+	{
+		uint32		buf_state;
+
+		victim_buffer = GetLocalVictimBuffer();
+		bufid = -(victim_buffer + 1);
+		bufHdr = GetLocalBufferDescriptor(bufid);
+
+		hresult = (LocalBufferLookupEnt *)
+			hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found);
+		if (found)					/* shouldn't happen */
+			elog(ERROR, "local buffer hash table corrupted");
+		hresult->id = bufid;
+
+		/*
+		 * it's all ours now.
+		 */
+		bufHdr->tag = newTag;
+
+		buf_state = pg_atomic_read_u32(&bufHdr->state);
+		buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+		buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+		*foundPtr = false;
 	}
 
-#ifdef LBDEBUG
-	fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
-			smgr->smgr_rlocator.locator.relNumber, forkNum, blockNum,
-			-nextFreeLocalBuf - 1);
-#endif
+	return bufHdr;
+}
+
+static Buffer
+GetLocalVictimBuffer(void)
+{
+	int			victim_bufid;
+	int			trycounter;
+	uint32		buf_state;
+	BufferDesc *bufHdr;
+
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
 	/*
 	 * Need to get a new buffer.  We use a clock sweep algorithm (essentially
@@ -154,14 +182,14 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	trycounter = NLocBuffer;
 	for (;;)
 	{
-		b = nextFreeLocalBuf;
+		victim_bufid = nextFreeLocalBufId;
 
-		if (++nextFreeLocalBuf >= NLocBuffer)
-			nextFreeLocalBuf = 0;
+		if (++nextFreeLocalBufId >= NLocBuffer)
+			nextFreeLocalBufId = 0;
 
-		bufHdr = GetLocalBufferDescriptor(b);
+		bufHdr = GetLocalBufferDescriptor(victim_bufid);
 
-		if (LocalRefCount[b] == 0)
+		if (LocalRefCount[victim_bufid] == 0)
 		{
 			buf_state = pg_atomic_read_u32(&bufHdr->state);
 
@@ -184,6 +212,15 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 					 errmsg("no empty local buffer available")));
 	}
 
+	/*
+	 * lazy memory allocation: allocate space on first use of a buffer.
+	 */
+	if (LocalBufHdrGetBlock(bufHdr) == NULL)
+	{
+		/* Set pointer for use by BufferGetBlock() macro */
+		LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage();
+	}
+
 	/*
 	 * this buffer is not referenced but it might still be dirty. if that's
 	 * the case, write it out before reusing it!
@@ -213,19 +250,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	}
 
 	/*
-	 * lazy memory allocation: allocate space on first use of a buffer.
-	 */
-	if (LocalBufHdrGetBlock(bufHdr) == NULL)
-	{
-		/* Set pointer for use by BufferGetBlock() macro */
-		LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage();
-	}
-
-	/*
-	 * Update the hash table: remove old entry, if any, and make new one.
+	 * Remove the victim buffer from the hashtable and mark as invalid.
 	 */
 	if (buf_state & BM_TAG_VALID)
 	{
+		LocalBufferLookupEnt *hresult;
+
 		hresult = (LocalBufferLookupEnt *)
 			hash_search(LocalBufHash, (void *) &bufHdr->tag,
 						HASH_REMOVE, NULL);
@@ -233,28 +263,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 			elog(ERROR, "local buffer hash table corrupted");
 		/* mark buffer invalid just in case hash insert fails */
 		ClearBufferTag(&bufHdr->tag);
-		buf_state &= ~(BM_VALID | BM_TAG_VALID);
+		buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
 		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
 	}
 
-	hresult = (LocalBufferLookupEnt *)
-		hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found);
-	if (found)					/* shouldn't happen */
-		elog(ERROR, "local buffer hash table corrupted");
-	hresult->id = b;
-
-	/*
-	 * it's all ours now.
-	 */
-	bufHdr->tag = newTag;
-	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
-	buf_state |= BM_TAG_VALID;
-	buf_state &= ~BUF_USAGECOUNT_MASK;
-	buf_state += BUF_USAGECOUNT_ONE;
-	pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-
-	*foundPtr = false;
-	return bufHdr;
+	return BufferDescriptorGetBuffer(bufHdr);
 }
 
 /*
@@ -423,7 +436,7 @@ InitLocalBuffers(void)
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of memory")));
 
-	nextFreeLocalBuf = 0;
+	nextFreeLocalBufId = 0;
 
 	/* initialize fields that need to start off nonzero */
 	for (i = 0; i < nbufs; i++)
-- 
2.38.0

From 523335a6ef6b959d4aa1a666d1a76ba133b8b3c2 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 24 Oct 2022 16:44:16 -0700
Subject: [PATCH v2 06/14] bufmgr: Support multiple in-progress IOs by using
 resowner

---
 src/include/storage/bufmgr.h          |  2 +-
 src/include/utils/resowner_private.h  |  5 ++
 src/backend/access/transam/xact.c     |  4 +-
 src/backend/postmaster/autovacuum.c   |  1 -
 src/backend/postmaster/bgwriter.c     |  1 -
 src/backend/postmaster/checkpointer.c |  1 -
 src/backend/postmaster/walwriter.c    |  1 -
 src/backend/storage/buffer/bufmgr.c   | 86 ++++++++++++---------------
 src/backend/utils/resowner/resowner.c | 60 +++++++++++++++++++
 9 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 3becf32a3c0..2e1d7540fd0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -176,7 +176,7 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer);
 extern bool IsBufferCleanupOK(Buffer buffer);
 extern bool HoldingBufferPinThatDelaysRecovery(void);
 
-extern void AbortBufferIO(void);
+extern void AbortBufferIO(Buffer buffer);
 
 extern void BufmgrCommit(void);
 extern bool BgBufferSync(struct WritebackContext *wb_context);
diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h
index 1b1f3181b54..ae58438ec76 100644
--- a/src/include/utils/resowner_private.h
+++ b/src/include/utils/resowner_private.h
@@ -30,6 +30,11 @@ extern void ResourceOwnerEnlargeBuffers(ResourceOwner owner);
 extern void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer);
 extern void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer);
 
+/* support for IO-in-progress management */
+extern void ResourceOwnerEnlargeBufferIOs(ResourceOwner owner);
+extern void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer);
+extern void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer);
+
 /* support for local lock management */
 extern void ResourceOwnerRememberLock(ResourceOwner owner, LOCALLOCK *locallock);
 extern void ResourceOwnerForgetLock(ResourceOwner owner, LOCALLOCK *locallock);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index d85e3139082..2a91ed64a7a 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2725,8 +2725,7 @@ AbortTransaction(void)
 	pgstat_report_wait_end();
 	pgstat_progress_end_command();
 
-	/* Clean up buffer I/O and buffer context locks, too */
-	AbortBufferIO();
+	/* Clean up buffer context locks, too */
 	UnlockBuffers();
 
 	/* Reset WAL record construction state */
@@ -5086,7 +5085,6 @@ AbortSubTransaction(void)
 
 	pgstat_report_wait_end();
 	pgstat_progress_end_command();
-	AbortBufferIO();
 	UnlockBuffers();
 
 	/* Reset WAL record construction state */
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index f5ea381c53e..2d5d1855439 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -526,7 +526,6 @@ AutoVacLauncherMain(int argc, char *argv[])
 		 */
 		LWLockReleaseAll();
 		pgstat_report_wait_end();
-		AbortBufferIO();
 		UnlockBuffers();
 		/* this is probably dead code, but let's be safe: */
 		if (AuxProcessResourceOwner)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 69667f0eb4b..1c4244b6ec5 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -167,7 +167,6 @@ BackgroundWriterMain(void)
 		 */
 		LWLockReleaseAll();
 		ConditionVariableCancelSleep();
-		AbortBufferIO();
 		UnlockBuffers();
 		ReleaseAuxProcessResources(false);
 		AtEOXact_Buffers(false);
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index de0bbbfa791..c1ed6737df5 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -271,7 +271,6 @@ CheckpointerMain(void)
 		LWLockReleaseAll();
 		ConditionVariableCancelSleep();
 		pgstat_report_wait_end();
-		AbortBufferIO();
 		UnlockBuffers();
 		ReleaseAuxProcessResources(false);
 		AtEOXact_Buffers(false);
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 3113e8fbdd5..20e0807c60d 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -163,7 +163,6 @@ WalWriterMain(void)
 		LWLockReleaseAll();
 		ConditionVariableCancelSleep();
 		pgstat_report_wait_end();
-		AbortBufferIO();
 		UnlockBuffers();
 		ReleaseAuxProcessResources(false);
 		AtEOXact_Buffers(false);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b9af8a05989..0cdeb644e6e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -159,10 +159,6 @@ int			checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
 int			bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
 int			backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
 
-/* local state for StartBufferIO and related functions */
-static BufferDesc *InProgressBuf = NULL;
-static bool IsForInput;
-
 /* local state for LockBufferForCleanup */
 static BufferDesc *PinCountWaitBuf = NULL;
 
@@ -2689,7 +2685,6 @@ InitBufferPoolAccess(void)
 static void
 AtProcExit_Buffers(int code, Datum arg)
 {
-	AbortBufferIO();
 	UnlockBuffers();
 
 	CheckForBufferLeaks();
@@ -4618,7 +4613,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 {
 	uint32		buf_state;
 
-	Assert(!InProgressBuf);
+	ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner);
 
 	for (;;)
 	{
@@ -4642,8 +4637,8 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 	buf_state |= BM_IO_IN_PROGRESS;
 	UnlockBufHdr(buf, buf_state);
 
-	InProgressBuf = buf;
-	IsForInput = forInput;
+	ResourceOwnerRememberBufferIO(CurrentResourceOwner,
+								  BufferDescriptorGetBuffer(buf));
 
 	return true;
 }
@@ -4669,8 +4664,6 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 {
 	uint32		buf_state;
 
-	Assert(buf == InProgressBuf);
-
 	buf_state = LockBufHdr(buf);
 
 	Assert(buf_state & BM_IO_IN_PROGRESS);
@@ -4682,13 +4675,14 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 	buf_state |= set_flag_bits;
 	UnlockBufHdr(buf, buf_state);
 
-	InProgressBuf = NULL;
+	ResourceOwnerForgetBufferIO(CurrentResourceOwner,
+								BufferDescriptorGetBuffer(buf));
 
 	ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
 }
 
 /*
- * AbortBufferIO: Clean up any active buffer I/O after an error.
+ * AbortBufferIO: Clean up active buffer I/O after an error.
  *
  *	All LWLocks we might have held have been released,
  *	but we haven't yet released buffer pins, so the buffer is still pinned.
@@ -4697,46 +4691,42 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  *	possible the error condition wasn't related to the I/O.
  */
 void
-AbortBufferIO(void)
+AbortBufferIO(Buffer buf)
 {
-	BufferDesc *buf = InProgressBuf;
+	BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
+	uint32		buf_state;
 
-	if (buf)
+	buf_state = LockBufHdr(buf_hdr);
+	Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
+
+	if (!(buf_state & BM_VALID))
 	{
-		uint32		buf_state;
-
-		buf_state = LockBufHdr(buf);
-		Assert(buf_state & BM_IO_IN_PROGRESS);
-		if (IsForInput)
-		{
-			Assert(!(buf_state & BM_DIRTY));
-
-			/* We'd better not think buffer is valid yet */
-			Assert(!(buf_state & BM_VALID));
-			UnlockBufHdr(buf, buf_state);
-		}
-		else
-		{
-			Assert(buf_state & BM_DIRTY);
-			UnlockBufHdr(buf, buf_state);
-			/* Issue notice if this is not the first failure... */
-			if (buf_state & BM_IO_ERROR)
-			{
-				/* Buffer is pinned, so we can read tag without spinlock */
-				char	   *path;
-
-				path = relpathperm(BufTagGetRelFileLocator(&buf->tag),
-								   BufTagGetForkNum(&buf->tag));
-				ereport(WARNING,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg("could not write block %u of %s",
-								buf->tag.blockNum, path),
-						 errdetail("Multiple failures --- write error might be permanent.")));
-				pfree(path);
-			}
-		}
-		TerminateBufferIO(buf, false, BM_IO_ERROR);
+		Assert(!(buf_state & BM_DIRTY));
+		UnlockBufHdr(buf_hdr, buf_state);
 	}
+	else
+	{
+		Assert(buf_state & BM_DIRTY);
+		UnlockBufHdr(buf_hdr, buf_state);
+
+		/* Issue notice if this is not the first failure... */
+		if (buf_state & BM_IO_ERROR)
+		{
+			/* Buffer is pinned, so we can read tag without spinlock */
+			char	   *path;
+
+			path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
+							   BufTagGetForkNum(&buf_hdr->tag));
+			ereport(WARNING,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not write block %u of %s",
+							buf_hdr->tag.blockNum, path),
+					 errdetail("Multiple failures --- write error might be permanent.")));
+			pfree(path);
+		}
+	}
+
+	TerminateBufferIO(buf_hdr, false, BM_IO_ERROR);
 }
 
 /*
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index 19b6241e45d..fccc59b39dd 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -121,6 +121,7 @@ typedef struct ResourceOwnerData
 
 	/* We have built-in support for remembering: */
 	ResourceArray bufferarr;	/* owned buffers */
+	ResourceArray bufferioarr;	/* in-progress buffer IO */
 	ResourceArray catrefarr;	/* catcache references */
 	ResourceArray catlistrefarr;	/* catcache-list pins */
 	ResourceArray relrefarr;	/* relcache references */
@@ -441,6 +442,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 	}
 
 	ResourceArrayInit(&(owner->bufferarr), BufferGetDatum(InvalidBuffer));
+	ResourceArrayInit(&(owner->bufferioarr), BufferGetDatum(InvalidBuffer));
 	ResourceArrayInit(&(owner->catrefarr), PointerGetDatum(NULL));
 	ResourceArrayInit(&(owner->catlistrefarr), PointerGetDatum(NULL));
 	ResourceArrayInit(&(owner->relrefarr), PointerGetDatum(NULL));
@@ -517,6 +519,24 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
 
 	if (phase == RESOURCE_RELEASE_BEFORE_LOCKS)
 	{
+		/*
+		 * Abort failed buffer IO. AbortBufferIO()->TerminateBufferIO() calls
+		 * ResourceOwnerForgetBufferIO(), so we just have to iterate till
+		 * there are none.
+		 *
+		 * Needs to be before we release buffer pins.
+		 *
+		 * During a commit, there shouldn't be any in-progress IO.
+		 */
+		while (ResourceArrayGetAny(&(owner->bufferioarr), &foundres))
+		{
+			Buffer		res = DatumGetBuffer(foundres);
+
+			if (isCommit)
+				elog(PANIC, "lost track of buffer IO on buffer %u", res);
+			AbortBufferIO(res);
+		}
+
 		/*
 		 * Release buffer pins.  Note that ReleaseBuffer will remove the
 		 * buffer entry from our array, so we just have to iterate till there
@@ -746,6 +766,7 @@ ResourceOwnerDelete(ResourceOwner owner)
 
 	/* And it better not own any resources, either */
 	Assert(owner->bufferarr.nitems == 0);
+	Assert(owner->bufferioarr.nitems == 0);
 	Assert(owner->catrefarr.nitems == 0);
 	Assert(owner->catlistrefarr.nitems == 0);
 	Assert(owner->relrefarr.nitems == 0);
@@ -775,6 +796,7 @@ ResourceOwnerDelete(ResourceOwner owner)
 
 	/* And free the object. */
 	ResourceArrayFree(&(owner->bufferarr));
+	ResourceArrayFree(&(owner->bufferioarr));
 	ResourceArrayFree(&(owner->catrefarr));
 	ResourceArrayFree(&(owner->catlistrefarr));
 	ResourceArrayFree(&(owner->relrefarr));
@@ -976,6 +998,44 @@ ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
 			 buffer, owner->name);
 }
 
+
+/*
+ * Make sure there is room for at least one more entry in a ResourceOwner's
+ * buffer IO array.
+ *
+ * This is separate from actually inserting an entry because if we run out
+ * of memory, it's critical to do so *before* acquiring the resource.
+ */
+void
+ResourceOwnerEnlargeBufferIOs(ResourceOwner owner)
+{
+	/* We used to allow pinning buffers without a resowner, but no more */
+	Assert(owner != NULL);
+	ResourceArrayEnlarge(&(owner->bufferioarr));
+}
+
+/*
+ * Remember that a buffer IO is owned by a ResourceOwner
+ *
+ * Caller must have previously done ResourceOwnerEnlargeBufferIOs()
+ */
+void
+ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
+{
+	ResourceArrayAdd(&(owner->bufferioarr), BufferGetDatum(buffer));
+}
+
+/*
+ * Forget that a buffer IO is owned by a ResourceOwner
+ */
+void
+ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
+{
+	if (!ResourceArrayRemove(&(owner->bufferioarr), BufferGetDatum(buffer)))
+		elog(PANIC, "buffer IO %d is not owned by resource owner %s",
+			 buffer, owner->name);
+}
+
 /*
  * Remember that a Local Lock is owned by a ResourceOwner
  *
-- 
2.38.0

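Roughly, the pieces above fit together as follows (a minimal sketch, not taken
from the patch itself; it only restates how the functions added above pair up):
StartBufferIO() registers the in-progress IO with CurrentResourceOwner,
TerminateBufferIO() forgets it again, and on error the resource owner release
path finds any leftover entry and calls AbortBufferIO() before releasing pins.

    /* hypothetical caller, shown only to illustrate the pairing */
    if (StartBufferIO(buf_hdr, true))   /* remembers the IO in CurrentResourceOwner */
    {
        /* ... read or write the block ... */
        TerminateBufferIO(buf_hdr, false, BM_VALID);    /* forgets the IO again */
    }
    /*
     * On ERROR, ResourceOwnerReleaseInternal() iterates over the remembered
     * buffer IOs and calls AbortBufferIO(buffer), which terminates the IO
     * with BM_IO_ERROR, before buffer pins are released.
     */
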
>From 83d6b1997b81caf1f47e4cac87c399f1407fa455 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 26 Oct 2022 14:44:02 -0700
Subject: [PATCH v2 07/14] bufmgr: Move relation extension handling into
 [Bulk]ExtendRelationBuffered()

---
 src/include/storage/buf_internals.h   |   5 +
 src/include/storage/bufmgr.h          |  13 +
 src/backend/storage/buffer/bufmgr.c   | 546 ++++++++++++++++++--------
 src/backend/storage/buffer/localbuf.c | 134 ++++++-
 src/backend/utils/probes.d            |   6 +-
 doc/src/sgml/monitoring.sgml          |  11 +-
 6 files changed, 546 insertions(+), 169 deletions(-)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 4b1aeb5fd25..57800254d2d 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -420,6 +420,11 @@ extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
 												BlockNumber blockNum);
 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
 									BlockNumber blockNum, bool *foundPtr);
+extern BlockNumber BulkExtendLocalRelationBuffered(SMgrRelation smgr,
+												   ForkNumber fork,
+												   ReadBufferMode mode,
+												   uint32 *num_pages,
+												   Buffer *buffers);
 extern void MarkLocalBufferDirty(Buffer buffer);
 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
 									 ForkNumber forkNum,
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 2e1d7540fd0..4ecd5399966 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -132,6 +132,19 @@ extern void IncrBufferRefCount(Buffer buffer);
 extern void BufferCheckOneLocalPin(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
+extern Buffer ExtendRelationBuffered(Relation reln, struct SMgrRelationData *smgr,
+									 bool skip_extension_lock,
+									 char relpersistence,
+									 ForkNumber forkNum, ReadBufferMode mode,
+									 BufferAccessStrategy strategy);
+extern BlockNumber BulkExtendRelationBuffered(Relation rel, struct SMgrRelationData *smgr,
+											  bool skip_extension_lock,
+											  char relpersistence,
+											  ForkNumber fork, ReadBufferMode mode,
+											  BufferAccessStrategy strategy,
+											  uint32 *num_pages,
+											  uint32 num_locked_pages,
+											  Buffer *buffers);
 
 extern void InitBufferPoolAccess(void);
 extern void AtEOXact_Buffers(bool isCommit);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0cdeb644e6e..361ebc3ae26 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -48,6 +48,7 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -459,6 +460,15 @@ static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
 								ForkNumber forkNum, BlockNumber blockNum,
 								ReadBufferMode mode, BufferAccessStrategy strategy,
 								bool *hit);
+static BlockNumber BulkExtendSharedRelationBuffered(Relation rel,
+													SMgrRelation smgr,
+													bool skip_extension_lock,
+													char relpersistence,
+													ForkNumber fork, ReadBufferMode mode,
+													BufferAccessStrategy strategy,
+													uint32 *num_pages,
+													uint32 num_locked_pages,
+													Buffer *buffers);
 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
@@ -793,6 +803,73 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * Convenience wrapper around BulkExtendRelationBuffered() extending by one
+ * block.
+ */
+Buffer
+ExtendRelationBuffered(Relation rel, struct SMgrRelationData *smgr,
+					   bool skip_extension_lock,
+					   char relpersistence, ForkNumber forkNum,
+					   ReadBufferMode mode, BufferAccessStrategy strategy)
+{
+	Buffer buf;
+	uint32 num_pages = 1;
+
+	BulkExtendRelationBuffered(rel, smgr, skip_extension_lock, relpersistence,
+							   forkNum, mode, strategy, &num_pages, num_pages, &buf);
+
+	return buf;
+}
+
+
+BlockNumber
+BulkExtendRelationBuffered(Relation rel,
+						   SMgrRelation smgr,
+						   bool skip_extension_lock,
+						   char relpersistence,
+						   ForkNumber fork, ReadBufferMode mode,
+						   BufferAccessStrategy strategy,
+						   uint32 *num_pages,
+						   uint32 num_locked_pages,
+						   Buffer *buffers)
+{
+	BlockNumber first_block;
+
+	Assert(rel != NULL || smgr != NULL);
+	Assert(rel != NULL || skip_extension_lock);
+
+	if (smgr == NULL)
+		smgr = RelationGetSmgr(rel);
+
+	TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
+										 smgr->smgr_rlocator.locator.spcOid,
+										 smgr->smgr_rlocator.locator.dbOid,
+										 smgr->smgr_rlocator.locator.relNumber,
+										 smgr->smgr_rlocator.backend,
+										 num_pages);
+
+	if (SmgrIsTemp(smgr))
+		first_block = BulkExtendLocalRelationBuffered(smgr,
+													  fork, mode,
+													  num_pages, buffers);
+	else
+		first_block = BulkExtendSharedRelationBuffered(rel, smgr,
+													   skip_extension_lock, relpersistence,
+													   fork, mode, strategy,
+													   num_pages, num_locked_pages,
+													   buffers);
+
+	TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
+										smgr->smgr_rlocator.locator.spcOid,
+										smgr->smgr_rlocator.locator.dbOid,
+										smgr->smgr_rlocator.locator.relNumber,
+										smgr->smgr_rlocator.backend,
+										num_pages,
+										first_block);
+
+	return first_block;
+}
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
@@ -807,43 +884,32 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	BufferDesc *bufHdr;
 	Block		bufBlock;
 	bool		found;
-	bool		isExtend;
 	bool		isLocalBuf = SmgrIsTemp(smgr);
 
 	*hit = false;
 
+	/*
+	 * Backward compatibility path; most code should use
+	 * ExtendRelationBuffered() instead, as acquiring the extension lock
+	 * inside ExtendRelationBuffered() scales a lot better.
+	 */
+	if (unlikely(blockNum == P_NEW))
+		return ExtendRelationBuffered(NULL, smgr, true, relpersistence, forkNum, mode, strategy);
+
 	/* Make sure we will have room to remember the buffer pin */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
-	isExtend = (blockNum == P_NEW);
-
 	TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
 									   smgr->smgr_rlocator.locator.spcOid,
 									   smgr->smgr_rlocator.locator.dbOid,
 									   smgr->smgr_rlocator.locator.relNumber,
-									   smgr->smgr_rlocator.backend,
-									   isExtend);
-
-	/* Substitute proper block number if caller asked for P_NEW */
-	if (isExtend)
-	{
-		blockNum = smgrnblocks(smgr, forkNum);
-		/* Fail if relation is already at maximum possible length */
-		if (blockNum == P_NEW)
-			ereport(ERROR,
-					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("cannot extend relation %s beyond %u blocks",
-							relpath(smgr->smgr_rlocator, forkNum),
-							P_NEW)));
-	}
+									   smgr->smgr_rlocator.backend);
 
 	if (isLocalBuf)
 	{
 		bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
 		if (found)
 			pgBufferUsage.local_blks_hit++;
-		else if (isExtend)
-			pgBufferUsage.local_blks_written++;
 		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 				 mode == RBM_ZERO_ON_ERROR)
 			pgBufferUsage.local_blks_read++;
@@ -858,8 +924,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 							 strategy, &found);
 		if (found)
 			pgBufferUsage.shared_blks_hit++;
-		else if (isExtend)
-			pgBufferUsage.shared_blks_written++;
 		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 				 mode == RBM_ZERO_ON_ERROR)
 			pgBufferUsage.shared_blks_read++;
@@ -870,168 +934,88 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* if it was already in the buffer pool, we're done */
 	if (found)
 	{
-		if (!isExtend)
-		{
-			/* Just need to update stats before we exit */
-			*hit = true;
-			VacuumPageHit++;
+		/* Just need to update stats before we exit */
+		*hit = true;
+		VacuumPageHit++;
 
-			if (VacuumCostActive)
-				VacuumCostBalance += VacuumCostPageHit;
+		if (VacuumCostActive)
+			VacuumCostBalance += VacuumCostPageHit;
 
-			TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-											  smgr->smgr_rlocator.locator.spcOid,
-											  smgr->smgr_rlocator.locator.dbOid,
-											  smgr->smgr_rlocator.locator.relNumber,
-											  smgr->smgr_rlocator.backend,
-											  isExtend,
-											  found);
-
-			/*
-			 * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
-			 * locked on return.
-			 */
-			if (!isLocalBuf)
-			{
-				if (mode == RBM_ZERO_AND_LOCK)
-					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
-								  LW_EXCLUSIVE);
-				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
-					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
-			}
-
-			return BufferDescriptorGetBuffer(bufHdr);
-		}
+		TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+										  smgr->smgr_rlocator.locator.spcOid,
+										  smgr->smgr_rlocator.locator.dbOid,
+										  smgr->smgr_rlocator.locator.relNumber,
+										  smgr->smgr_rlocator.backend,
+										  found);
 
 		/*
-		 * We get here only in the corner case where we are trying to extend
-		 * the relation but we found a pre-existing buffer marked BM_VALID.
-		 * This can happen because mdread doesn't complain about reads beyond
-		 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
-		 * read a block beyond EOF could have left a "valid" zero-filled
-		 * buffer.  Unfortunately, we have also seen this case occurring
-		 * because of buggy Linux kernels that sometimes return an
-		 * lseek(SEEK_END) result that doesn't account for a recent write. In
-		 * that situation, the pre-existing buffer would contain valid data
-		 * that we don't want to overwrite.  Since the legitimate case should
-		 * always have left a zero-filled buffer, complain if not PageIsNew.
+		 * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
+		 * locked on return.
 		 */
-		bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
-		if (!PageIsNew((Page) bufBlock))
-			ereport(ERROR,
-					(errmsg("unexpected data beyond EOF in block %u of relation %s",
-							blockNum, relpath(smgr->smgr_rlocator, forkNum)),
-					 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
-
-		/*
-		 * We *must* do smgrextend before succeeding, else the page will not
-		 * be reserved by the kernel, and the next P_NEW call will decide to
-		 * return the same page.  Clear the BM_VALID bit, do the StartBufferIO
-		 * call that BufferAlloc didn't, and proceed.
-		 */
-		if (isLocalBuf)
+		if (!isLocalBuf)
 		{
-			/* Only need to adjust flags */
-			uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);
-
-			Assert(buf_state & BM_VALID);
-			buf_state &= ~BM_VALID;
-			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+			if (mode == RBM_ZERO_AND_LOCK)
+				LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+							  LW_EXCLUSIVE);
+			else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+				LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
 		}
-		else
-		{
-			/*
-			 * Loop to handle the very small possibility that someone re-sets
-			 * BM_VALID between our clearing it and StartBufferIO inspecting
-			 * it.
-			 */
-			do
-			{
-				uint32		buf_state = LockBufHdr(bufHdr);
 
-				Assert(buf_state & BM_VALID);
-				buf_state &= ~BM_VALID;
-				UnlockBufHdr(bufHdr, buf_state);
-			} while (!StartBufferIO(bufHdr, true));
-		}
+		return BufferDescriptorGetBuffer(bufHdr);
 	}
 
 	/*
 	 * if we have gotten to this point, we have allocated a buffer for the
 	 * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
 	 * if it's a shared buffer.
-	 *
-	 * Note: if smgrextend fails, we will end up with a buffer that is
-	 * allocated but not marked BM_VALID.  P_NEW will still select the same
-	 * block number (because the relation didn't get any longer on disk) and
-	 * so future attempts to extend the relation will find the same buffer (if
-	 * it's not been recycled) but come right back here to try smgrextend
-	 * again.
 	 */
 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));	/* spinlock not needed */
 
 	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
-	if (isExtend)
-	{
-		/* new buffers are zero-filled */
+	/*
+	 * Read in the page, unless the caller intends to overwrite it and
+	 * just wants us to allocate a buffer.
+	 */
+	if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 		MemSet((char *) bufBlock, 0, BLCKSZ);
-		/* don't set checksum for all-zero page */
-		smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
-
-		/*
-		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
-		 * although we're essentially performing a write. At least on linux
-		 * doing so defeats the 'delayed allocation' mechanism, leading to
-		 * increased file fragmentation.
-		 */
-	}
 	else
 	{
-		/*
-		 * Read in the page, unless the caller intends to overwrite it and
-		 * just wants us to allocate a buffer.
-		 */
-		if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
-			MemSet((char *) bufBlock, 0, BLCKSZ);
-		else
+		instr_time	io_start,
+					io_time;
+
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+
+		smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+
+		if (track_io_timing)
 		{
-			instr_time	io_start,
-						io_time;
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_SUBTRACT(io_time, io_start);
+			pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+			INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+		}
 
-			if (track_io_timing)
-				INSTR_TIME_SET_CURRENT(io_start);
-
-			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
-
-			if (track_io_timing)
+		/* check for garbage data */
+		if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+									PIV_LOG_WARNING | PIV_REPORT_STAT))
+		{
+			if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 			{
-				INSTR_TIME_SET_CURRENT(io_time);
-				INSTR_TIME_SUBTRACT(io_time, io_start);
-				pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-			}
-
-			/* check for garbage data */
-			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
-										PIV_LOG_WARNING | PIV_REPORT_STAT))
-			{
-				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-				{
-					ereport(WARNING,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s; zeroing out page",
-									blockNum,
-									relpath(smgr->smgr_rlocator, forkNum))));
-					MemSet((char *) bufBlock, 0, BLCKSZ);
-				}
-				else
-					ereport(ERROR,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s",
-									blockNum,
-									relpath(smgr->smgr_rlocator, forkNum))));
+				ereport(WARNING,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("invalid page in block %u of relation %s; zeroing out page",
+								blockNum,
+								relpath(smgr->smgr_rlocator, forkNum))));
+				MemSet((char *) bufBlock, 0, BLCKSZ);
 			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("invalid page in block %u of relation %s",
+								blockNum,
+								relpath(smgr->smgr_rlocator, forkNum))));
 		}
 	}
 
@@ -1074,7 +1058,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 									  smgr->smgr_rlocator.locator.dbOid,
 									  smgr->smgr_rlocator.locator.relNumber,
 									  smgr->smgr_rlocator.backend,
-									  isExtend,
 									  found);
 
 	return BufferDescriptorGetBuffer(bufHdr);
@@ -1617,6 +1600,251 @@ again:
 
 	return cur_buf;
 }
+
+/*
+ * Limit the number of pins a batch operation may additionally acquire, to
+ * avoid running out of pinnable buffers.
+ *
+ * One additional pin is always allowed, as otherwise the operation likely
+ * cannot be performed at all.
+ *
+ * The number of allowed pins for a backend is computed based on
+ * shared_buffers and the maximum number of connections possible. That's very
+ * pessimistic, but outside of toy-sized shared_buffers it should allow
+ * sufficient pins.
+ */
+static void
+LimitAdditionalPins(uint32 *additional_pins)
+{
+	uint32 max_backends;
+	int max_proportional_pins;
+
+	if (*additional_pins <= 1)
+		return;
+
+	max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
+	max_proportional_pins = NBuffers / max_backends;
+
+	/*
+	 * Subtract the approximate number of buffers already pinned by this
+	 * backend. We get the number of "overflowed" pins for free, but don't
+	 * know the number of pins in PrivateRefCountArray. The cost of
+	 * calculating that exactly doesn't seem worth it, so just assume the max.
+	 */
+	max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
+
+	if (max_proportional_pins < 0)
+		max_proportional_pins = 1;
+
+	if (*additional_pins > max_proportional_pins)
+		*additional_pins = max_proportional_pins;
+}
+
+static BlockNumber
+BulkExtendSharedRelationBuffered(Relation rel,
+								 SMgrRelation smgr,
+								 bool skip_extension_lock,
+								 char relpersistence,
+								 ForkNumber fork, ReadBufferMode mode,
+								 BufferAccessStrategy strategy,
+								 uint32 *num_pages,
+								 uint32 num_locked_pages,
+								 Buffer *buffers)
+{
+	BlockNumber first_block;
+
+	LimitAdditionalPins(num_pages);
+
+	/*
+	 * FIXME: limit num_pages / buffers based on NBuffers / MaxBackends or
+	 * such. Also keep MAX_SIMUL_LWLOCKS in mind.
+	 */
+
+	pgBufferUsage.shared_blks_written += *num_pages;
+
+	/*
+	 * Acquire victim buffers for extension without holding extension
+	 * lock. Writing out victim buffers is the most expensive part of
+	 * extending the relation, particularly when doing so requires WAL
+	 * flushes. Zeroing out the buffers is also quite expensive, so do that
+	 * before holding the extension lock as well.
+	 *
+	 * These pages are pinned by us and not valid. While we hold the pin
+	 * they can't be acquired as victim buffers by another backend.
+	 */
+	for (uint32 i = 0; i < *num_pages; i++)
+	{
+		Block		buf_block;
+
+		buffers[i] = GetVictimBuffer(strategy);
+		buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
+
+		/* new buffers are zero-filled */
+		MemSet((char *) buf_block, 0, BLCKSZ);
+	}
+
+	if (!skip_extension_lock)
+		LockRelationForExtension(rel, ExclusiveLock);
+
+	first_block = smgrnblocks(smgr, fork);
+
+	/* Fail if relation is already at maximum possible length */
+	if ((uint64) first_block + *num_pages >= MaxBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend relation %s beyond %u blocks",
+						relpath(smgr->smgr_rlocator, fork),
+						MaxBlockNumber)));
+
+	/*
+	 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
+	 *
+	 * This needs to happen before we extend the relation, because as soon as
+	 * we do, other backends can start to read in those pages.
+	 */
+	for (int i = 0; i < *num_pages; i++)
+	{
+		Buffer		victim_buf = buffers[i];
+		BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
+		BufferTag	tag;
+		uint32		hash;
+		LWLock     *partition_lock;
+		int			existing_id;
+
+		InitBufferTag(&tag, &smgr->smgr_rlocator.locator, fork, first_block + i);
+		hash = BufTableHashCode(&tag);
+		partition_lock = BufMappingPartitionLock(hash);
+
+		LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+		existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
+
+		/*
+		 * We get here only in the corner case where we are trying to extend
+		 * the relation but we found a pre-existing buffer. This can happen
+		 * because a prior attempt at extending the relation failed, and
+		 * because mdread doesn't complain about reads beyond EOF (when
+		 * zero_damaged_pages is ON) and so a previous attempt to read a block
+		 * beyond EOF could have left a "valid" zero-filled buffer.
+		 * Unfortunately, we have also seen this case occurring because of
+		 * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
+		 * that doesn't account for a recent write. In that situation, the
+		 * pre-existing buffer would contain valid data that we don't want to
+		 * overwrite.  Since the legitimate cases should always have left a
+		 * zero-filled buffer, complain if not PageIsNew.
+		 */
+		if (existing_id >= 0)
+		{
+			BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
+			Block		buf_block;
+			bool		valid;
+
+			/*
+			 * Pin the existing buffer before releasing the partition lock,
+			 * preventing it from being evicted.
+			 */
+			valid = PinBuffer(existing_hdr, strategy);
+
+			LWLockRelease(partition_lock);
+
+			/*
+			 * The victim buffer we acquired previously is clean and unused,
+			 * let it be found again quickly
+			 */
+			StrategyFreeBuffer(victim_buf_hdr);
+			UnpinBuffer(victim_buf_hdr);
+
+			buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
+			buf_block = BufHdrGetBlock(existing_hdr);
+
+			if (valid && !PageIsNew((Page) buf_block))
+				ereport(ERROR,
+						(errmsg("unexpected data beyond EOF in block %u of relation %s",
+								existing_hdr->tag.blockNum, relpath(smgr->smgr_rlocator, fork)),
+						 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+
+			/*
+			 * We *must* do smgr[zero]extend before succeeding, else the page
+			 * will not be reserved by the kernel, and the next P_NEW call
+			 * will decide to return the same page.  Clear the BM_VALID bit,
+			 * do StartBufferIO() and proceed.
+			 *
+			 * Loop to handle the very small possibility that someone re-sets
+			 * BM_VALID between our clearing it and StartBufferIO inspecting
+			 * it.
+			 */
+			do
+			{
+				uint32		buf_state = LockBufHdr(existing_hdr);
+
+				buf_state &= ~BM_VALID;
+				UnlockBufHdr(existing_hdr, buf_state);
+			} while (!StartBufferIO(existing_hdr, true));
+		}
+		else
+		{
+			uint32		buf_state;
+
+			buf_state = LockBufHdr(victim_buf_hdr);
+
+			/* some sanity checks while we hold the buffer header lock */
+			Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
+			Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
+
+			victim_buf_hdr->tag = tag;
+
+			buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+			if (relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
+				buf_state |= BM_PERMANENT;
+
+			UnlockBufHdr(victim_buf_hdr, buf_state);
+
+			LWLockRelease(partition_lock);
+
+			/* XXX: could combine the locked operations in it with the above */
+			StartBufferIO(victim_buf_hdr, true);
+		}
+	}
+
+	/*
+	 * Note: if smgrzeroextend fails, we will end up with buffers that are
+	 * allocated but not marked BM_VALID.  The next relation extension will
+	 * still select the same block number (because the relation didn't get any
+	 * longer on disk) and so future attempts to extend the relation will find
+	 * the same buffers (if they have not been recycled) but come right back
+	 * here to try smgrzeroextend again.
+	 *
+	 * We don't need to set checksum for all-zero pages.
+	 */
+	smgrzeroextend(smgr, fork, first_block, *num_pages, false);
+
+	/*
+	 * Release the file-extension lock; it's now OK for someone else to extend
+	 * the relation some more.
+	 *
+	 * We remove IO_IN_PROGRESS after this, as zeroing the buffer contents and
+	 * waking up waiting backends can take noticeable time.
+	 */
+	if (!skip_extension_lock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/* Set BM_VALID, terminate IO, and wake up any waiters */
+	for (int i = 0; i < *num_pages; i++)
+	{
+		Buffer		buf = buffers[i];
+		BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
+
+		if (i < num_locked_pages &&
+			(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK))
+			LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+
+		TerminateBufferIO(buf_hdr, false, BM_VALID);
+	}
+
+	return first_block;
+
+}
+
 /*
  * MarkBufferDirty
  *
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index b1d0c309918..0b5bc0017f1 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -48,6 +48,9 @@ static int	nextFreeLocalBufId = 0;
 
 static HTAB *LocalBufHash = NULL;
 
+/* number of local buffers pinned at least once */
+static int	NLocalPinnedBuffers = 0;
+
 
 static void InitLocalBuffers(void);
 static Block GetLocalBufferStorage(void);
@@ -270,6 +273,132 @@ GetLocalVictimBuffer(void)
 	return BufferDescriptorGetBuffer(bufHdr);
 }
 
+/* see LimitAdditionalPins() */
+static void
+LimitAdditionalLocalPins(uint32 *additional_pins)
+{
+	uint32 max_pins;
+
+	if (*additional_pins <= 1)
+		return;
+
+	/*
+	 * In contrast to LimitAdditionalPins() other backends don't play a role
+	 * In contrast to LimitAdditionalPins(), other backends don't play a role
+	 */
+	max_pins = (NLocBuffer - NLocalPinnedBuffers);
+
+	if (*additional_pins >= max_pins)
+		*additional_pins = max_pins;
+}
+
+BlockNumber
+BulkExtendLocalRelationBuffered(SMgrRelation smgr,
+								ForkNumber fork,
+								ReadBufferMode mode,
+								uint32 *num_pages,
+								Buffer *buffers)
+{
+	BlockNumber first_block;
+
+	/* Initialize local buffers if first request in this session */
+	if (LocalBufHash == NULL)
+		InitLocalBuffers();
+
+	LimitAdditionalLocalPins(num_pages);
+
+	pgBufferUsage.temp_blks_written += *num_pages;
+
+	for (uint32 i = 0; i < *num_pages; i++)
+	{
+		BufferDesc *buf_hdr;
+		Block		buf_block;
+
+		buffers[i] = GetLocalVictimBuffer();
+		buf_hdr = GetLocalBufferDescriptor(-(buffers[i] + 1));
+		buf_block = LocalBufHdrGetBlock(buf_hdr);
+
+		/* new buffers are zero-filled */
+		MemSet((char *) buf_block, 0, BLCKSZ);
+	}
+
+	first_block = smgrnblocks(smgr, fork);
+
+	/* Fail if relation is already at maximum possible length */
+	if ((uint64) first_block + *num_pages >= MaxBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend relation %s beyond %u blocks",
+						relpath(smgr->smgr_rlocator, fork),
+						MaxBlockNumber)));
+
+	for (int i = 0; i < *num_pages; i++)
+	{
+		int			victim_buf_id;
+		BufferDesc *victim_buf_hdr;
+		BufferTag	tag;
+		LocalBufferLookupEnt *hresult;
+		bool		found;
+
+		victim_buf_id = -(buffers[i] + 1);
+		victim_buf_hdr = GetLocalBufferDescriptor(victim_buf_id);
+
+		InitBufferTag(&tag, &smgr->smgr_rlocator.locator, fork, first_block + i);
+
+		hresult = (LocalBufferLookupEnt *)
+			hash_search(LocalBufHash, (void *) &tag, HASH_ENTER, &found);
+		if (found)
+		{
+			BufferDesc *existing_hdr = GetLocalBufferDescriptor(hresult->id);
+			uint32		buf_state;
+
+			UnpinLocalBuffer(BufferDescriptorGetBuffer(victim_buf_hdr));
+
+			existing_hdr = GetLocalBufferDescriptor(hresult->id);
+			PinLocalBuffer(existing_hdr, false);
+			buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
+
+			buf_state = pg_atomic_read_u32(&existing_hdr->state);
+			Assert(buf_state & BM_TAG_VALID);
+			Assert(!(buf_state & BM_DIRTY));
+			buf_state &= ~BM_VALID;
+			pg_atomic_unlocked_write_u32(&existing_hdr->state, buf_state);
+		}
+		else
+		{
+			uint32		buf_state = pg_atomic_read_u32(&victim_buf_hdr->state);
+
+			Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
+
+			victim_buf_hdr->tag = tag;
+
+			buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+
+			pg_atomic_unlocked_write_u32(&victim_buf_hdr->state, buf_state);
+
+			hresult->id = victim_buf_id;
+		}
+	}
+
+	/* actually extend relation */
+	smgrzeroextend(smgr, fork, first_block, *num_pages, false);
+
+	for (int i = 0; i < *num_pages; i++)
+	{
+		Buffer		buf = buffers[i];
+		BufferDesc *buf_hdr;
+		uint32		buf_state;
+
+		buf_hdr = GetLocalBufferDescriptor(-(buf + 1));
+
+		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+		buf_state |= BM_VALID;
+		pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+	}
+
+	return first_block;
+}
+
 /*
  * MarkLocalBufferDirty -
  *	  mark a local buffer dirty
@@ -486,6 +615,7 @@ PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
 
 	if (LocalRefCount[bufid] == 0)
 	{
+		NLocalPinnedBuffers++;
 		if (adjust_usagecount &&
 			BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
 		{
@@ -507,9 +637,11 @@ UnpinLocalBuffer(Buffer buffer)
 
 	Assert(BufferIsLocal(buffer));
 	Assert(LocalRefCount[buffid] > 0);
+	Assert(NLocalPinnedBuffers > 0);
 
 	ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
-	LocalRefCount[buffid]--;
+	if (--LocalRefCount[buffid] == 0)
+		NLocalPinnedBuffers--;
 }
 
 /*
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index c064d679e94..f18a8fbaed0 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -55,10 +55,12 @@ provider postgresql {
 	probe sort__start(int, bool, int, int, bool, int);
 	probe sort__done(bool, long);
 
-	probe buffer__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool);
-	probe buffer__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool, bool);
+	probe buffer__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int);
+	probe buffer__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool);
 	probe buffer__flush__start(ForkNumber, BlockNumber, Oid, Oid, Oid);
 	probe buffer__flush__done(ForkNumber, BlockNumber, Oid, Oid, Oid);
+	probe buffer__extend__start(ForkNumber, Oid, Oid, Oid, int, int);
+	probe buffer__extend__done(ForkNumber, Oid, Oid, Oid, int, int, BlockNumber);
 
 	probe buffer__checkpoint__start(int);
 	probe buffer__checkpoint__sync__start();
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index cf220c3bcb4..99a78150814 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -7379,7 +7379,7 @@ FROM pg_stat_get_backend_idset() AS backendid;
     </row>
     <row>
      <entry><literal>buffer-read-start</literal></entry>
-     <entry><literal>(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool)</literal></entry>
+     <entry><literal>(ForkNumber, BlockNumber, Oid, Oid, Oid, int)</literal></entry>
      <entry>Probe that fires when a buffer read is started.
       arg0 and arg1 contain the fork and block numbers of the page (but
       arg1 will be -1 if this is a relation extension request).
@@ -7387,12 +7387,11 @@ FROM pg_stat_get_backend_idset() AS backendid;
       identifying the relation.
       arg5 is the ID of the backend which created the temporary relation for a
       local buffer, or <symbol>InvalidBackendId</symbol> (-1) for a shared buffer.
-      arg6 is true for a relation extension request, false for normal
-      read.</entry>
+      </entry>
     </row>
     <row>
      <entry><literal>buffer-read-done</literal></entry>
-     <entry><literal>(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool, bool)</literal></entry>
+     <entry><literal>(ForkNumber, BlockNumber, Oid, Oid, Oid, int, bool)</literal></entry>
      <entry>Probe that fires when a buffer read is complete.
       arg0 and arg1 contain the fork and block numbers of the page (if this
       is a relation extension request, arg1 now contains the block number
@@ -7401,9 +7400,7 @@ FROM pg_stat_get_backend_idset() AS backendid;
       identifying the relation.
       arg5 is the ID of the backend which created the temporary relation for a
       local buffer, or <symbol>InvalidBackendId</symbol> (-1) for a shared buffer.
-      arg6 is true for a relation extension request, false for normal
-      read.
-      arg7 is true if the buffer was found in the pool, false if not.</entry>
+      arg6 is true if the buffer was found in the pool, false if not.</entry>
     </row>
     <row>
      <entry><literal>buffer-flush-start</literal></entry>
-- 
2.38.0

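For reference, a minimal usage sketch of the new bulk API (parameter values are
purely illustrative; the signature is the one declared in bufmgr.h above):

    Buffer      bufs[8];
    uint32      num_pages = 8;      /* may be clamped by LimitAdditionalPins() */
    BlockNumber first_block;

    first_block = BulkExtendRelationBuffered(rel, NULL,
                                             false,  /* acquire extension lock */
                                             rel->rd_rel->relpersistence,
                                             MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
                                             NULL,   /* no strategy */
                                             &num_pages, /* in/out: pages granted */
                                             1,      /* lock only the first page */
                                             bufs);
    /*
     * bufs[0 .. num_pages - 1] are pinned and valid; bufs[0] is additionally
     * exclusive-locked because of RBM_ZERO_AND_LOCK and num_locked_pages = 1.
     */
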
>From 2ffc7854ecd27a188d8e6d3dd3191e6fb93f6e61 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 24 Oct 2022 12:18:18 -0700
Subject: [PATCH v2 08/14] Convert a few places to ExtendRelationBuffered

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/access/brin/brin.c         | 13 ++++++----
 src/backend/access/brin/brin_pageops.c |  4 +++
 src/backend/access/brin/brin_revmap.c  | 17 ++++--------
 src/backend/access/gin/gininsert.c     | 14 +++++-----
 src/backend/access/gin/ginutil.c       | 15 +++--------
 src/backend/access/gin/ginvacuum.c     |  8 ++++++
 src/backend/access/gist/gist.c         |  6 +++--
 src/backend/access/gist/gistutil.c     | 16 +++---------
 src/backend/access/gist/gistvacuum.c   |  3 +++
 src/backend/access/nbtree/nbtpage.c    | 36 ++++++++------------------
 src/backend/access/nbtree/nbtree.c     |  3 +++
 src/backend/access/spgist/spgutils.c   | 15 +++--------
 contrib/bloom/blutils.c                | 14 +++-------
 13 files changed, 70 insertions(+), 94 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index de1427a1e0e..1810f7ebfef 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -829,9 +829,11 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	 * whole relation will be rolled back.
 	 */
 
-	meta = ReadBuffer(index, P_NEW);
+	meta = ExtendRelationBuffered(index, NULL, true,
+								  index->rd_rel->relpersistence,
+								  MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+								  NULL);
 	Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
-	LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
 
 	brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
 					   BRIN_CURRENT_VERSION);
@@ -896,9 +898,10 @@ brinbuildempty(Relation index)
 	Buffer		metabuf;
 
 	/* An empty BRIN index has a metapage only. */
-	metabuf =
-		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
-	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+	metabuf = ExtendRelationBuffered(index, NULL, true,
+									 index->rd_rel->relpersistence,
+									 INIT_FORKNUM, RBM_ZERO_AND_LOCK,
+									 NULL);
 
 	/* Initialize and xlog metabuffer. */
 	START_CRIT_SECTION();
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index ad5a89bd051..b578d259545 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -730,6 +730,10 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 			 * There's not enough free space in any existing index page,
 			 * according to the FSM: extend the relation to obtain a shiny new
 			 * page.
+			 *
+			 * XXX: It's likely possible to use RBM_ZERO_AND_LOCK here,
+			 * which'd avoid the need to hold the extension lock during buffer
+			 * reclaim.
 			 */
 			if (!RELATION_IS_LOCAL(irel))
 			{
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 7fc5226bf74..7ae9cecf43d 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -538,7 +538,6 @@ revmap_physical_extend(BrinRevmap *revmap)
 	BlockNumber mapBlk;
 	BlockNumber nblocks;
 	Relation	irel = revmap->rm_irel;
-	bool		needLock = !RELATION_IS_LOCAL(irel);
 
 	/*
 	 * Lock the metapage. This locks out concurrent extensions of the revmap,
@@ -570,10 +569,10 @@ revmap_physical_extend(BrinRevmap *revmap)
 	}
 	else
 	{
-		if (needLock)
-			LockRelationForExtension(irel, ExclusiveLock);
-
-		buf = ReadBuffer(irel, P_NEW);
+		buf = ExtendRelationBuffered(irel, NULL, false,
+									 irel->rd_rel->relpersistence,
+									 MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									 NULL);
 		if (BufferGetBlockNumber(buf) != mapBlk)
 		{
 			/*
@@ -582,17 +581,11 @@ revmap_physical_extend(BrinRevmap *revmap)
 			 * up and have caller start over.  We will have to evacuate that
 			 * page from under whoever is using it.
 			 */
-			if (needLock)
-				UnlockRelationForExtension(irel, ExclusiveLock);
 			LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
-			ReleaseBuffer(buf);
+			UnlockReleaseBuffer(buf);
 			return;
 		}
-		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 		page = BufferGetPage(buf);
-
-		if (needLock)
-			UnlockRelationForExtension(irel, ExclusiveLock);
 	}
 
 	/* Check that it's a regular block (or an empty page) */
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index d5d748009ea..ea65b460c72 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -440,12 +440,14 @@ ginbuildempty(Relation index)
 				MetaBuffer;
 
 	/* An empty GIN index has two pages. */
-	MetaBuffer =
-		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
-	LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
-	RootBuffer =
-		ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
-	LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);
+	MetaBuffer = ExtendRelationBuffered(index, NULL, true,
+										index->rd_rel->relpersistence,
+										INIT_FORKNUM, RBM_ZERO_AND_LOCK,
+										NULL);
+	RootBuffer = ExtendRelationBuffered(index, NULL, true,
+										index->rd_rel->relpersistence,
+										INIT_FORKNUM, RBM_ZERO_AND_LOCK,
+										NULL);
 
 	/* Initialize and xlog metabuffer and root buffer. */
 	START_CRIT_SECTION();
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index e7cc452a8aa..c0362ab384c 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -298,7 +298,6 @@ Buffer
 GinNewBuffer(Relation index)
 {
 	Buffer		buffer;
-	bool		needLock;
 
 	/* First, try to get a page from FSM */
 	for (;;)
@@ -326,16 +325,10 @@ GinNewBuffer(Relation index)
 		ReleaseBuffer(buffer);
 	}
 
-	/* Must extend the file */
-	needLock = !RELATION_IS_LOCAL(index);
-	if (needLock)
-		LockRelationForExtension(index, ExclusiveLock);
-
-	buffer = ReadBuffer(index, P_NEW);
-	LockBuffer(buffer, GIN_EXCLUSIVE);
-
-	if (needLock)
-		UnlockRelationForExtension(index, ExclusiveLock);
+	buffer = ExtendRelationBuffered(index, NULL, false,
+									index->rd_rel->relpersistence,
+									MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									NULL);
 
 	return buffer;
 }
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index e5d310d8362..13251d7e07d 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -736,6 +736,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	 */
 	needLock = !RELATION_IS_LOCAL(index);
 
+	/*
+	 * FIXME: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
+	 * think this is still required?
+	 */
 	if (needLock)
 		LockRelationForExtension(index, ExclusiveLock);
 	npages = RelationGetNumberOfBlocks(index);
@@ -786,6 +790,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 	stats->pages_free = totFreePages;
 
+	/*
+	 * FIXME: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
+	 * think this is still required?
+	 */
 	if (needLock)
 		LockRelationForExtension(index, ExclusiveLock);
 	stats->num_pages = RelationGetNumberOfBlocks(index);
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index ba394f08f61..6dfb07a45b7 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -133,8 +133,10 @@ gistbuildempty(Relation index)
 	Buffer		buffer;
 
 	/* Initialize the root page */
-	buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	buffer = ExtendRelationBuffered(index, NULL, true,
+									index->rd_rel->relpersistence,
+									INIT_FORKNUM, RBM_ZERO_AND_LOCK,
+									NULL);
 
 	/* Initialize and xlog buffer */
 	START_CRIT_SECTION();
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 56451fede10..c8d57f06d20 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -824,7 +824,6 @@ Buffer
 gistNewBuffer(Relation r)
 {
 	Buffer		buffer;
-	bool		needLock;
 
 	/* First, try to get a page from FSM */
 	for (;;)
@@ -877,17 +876,10 @@ gistNewBuffer(Relation r)
 		ReleaseBuffer(buffer);
 	}
 
-	/* Must extend the file */
-	needLock = !RELATION_IS_LOCAL(r);
-
-	if (needLock)
-		LockRelationForExtension(r, ExclusiveLock);
-
-	buffer = ReadBuffer(r, P_NEW);
-	LockBuffer(buffer, GIST_EXCLUSIVE);
-
-	if (needLock)
-		UnlockRelationForExtension(r, ExclusiveLock);
+	buffer = ExtendRelationBuffered(r, NULL, false,
+									r->rd_rel->relpersistence,
+									MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									NULL);
 
 	return buffer;
 }
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 3f60d3274d2..cc711b04986 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -203,6 +203,9 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * we must already have processed any tuples due to be moved into such a
 	 * page.
 	 *
+	 * FIXME: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
+	 * think this issue still exists?
+	 *
 	 * We can skip locking for new or temp relations, however, since no one
 	 * else could be accessing them.
 	 */
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 3feee28d197..1733f2a18ed 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -881,7 +881,6 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	}
 	else
 	{
-		bool		needLock;
 		Page		page;
 
 		Assert(access == BT_WRITE);
@@ -962,31 +961,18 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		}
 
 		/*
-		 * Extend the relation by one page.
-		 *
-		 * We have to use a lock to ensure no one else is extending the rel at
-		 * the same time, else we will both try to initialize the same new
-		 * page.  We can skip locking for new or temp relations, however,
-		 * since no one else could be accessing them.
+		 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or
+		 * we risk a race condition against btvacuumscan --- see comments
+		 * therein. This forces us to repeat the valgrind request that
+		 * _bt_lockbuf() otherwise would make, as we can't use _bt_lockbuf()
+		 * without introducing a race.
 		 */
-		needLock = !RELATION_IS_LOCAL(rel);
-
-		if (needLock)
-			LockRelationForExtension(rel, ExclusiveLock);
-
-		buf = ReadBuffer(rel, P_NEW);
-
-		/* Acquire buffer lock on new page */
-		_bt_lockbuf(rel, buf, BT_WRITE);
-
-		/*
-		 * Release the file-extension lock; it's now OK for someone else to
-		 * extend the relation some more.  Note that we cannot release this
-		 * lock before we have buffer lock on the new page, or we risk a race
-		 * condition against btvacuumscan --- see comments therein.
-		 */
-		if (needLock)
-			UnlockRelationForExtension(rel, ExclusiveLock);
+		buf = ExtendRelationBuffered(rel, NULL, false,
+									 rel->rd_rel->relpersistence,
+									 MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									 NULL);
+		if (!RelationUsesLocalBuffers(rel))
+			VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 
 		/* Initialize the new page before returning it */
 		page = BufferGetPage(buf);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1cc88da032d..383f18a999e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -969,6 +969,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 * write-lock on the left page before it adds a right page, so we must
 	 * already have processed any tuples due to be moved into such a page.
 	 *
+	 * FIXME: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
+	 * think this issue still exists?
+	 *
 	 * We can skip locking for new or temp relations, however, since no one
 	 * else could be accessing them.
 	 */
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 3761f2c193b..f459b45eaa9 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -365,7 +365,6 @@ Buffer
 SpGistNewBuffer(Relation index)
 {
 	Buffer		buffer;
-	bool		needLock;
 
 	/* First, try to get a page from FSM */
 	for (;;)
@@ -405,16 +404,10 @@ SpGistNewBuffer(Relation index)
 		ReleaseBuffer(buffer);
 	}
 
-	/* Must extend the file */
-	needLock = !RELATION_IS_LOCAL(index);
-	if (needLock)
-		LockRelationForExtension(index, ExclusiveLock);
-
-	buffer = ReadBuffer(index, P_NEW);
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
-	if (needLock)
-		UnlockRelationForExtension(index, ExclusiveLock);
+	buffer = ExtendRelationBuffered(index, NULL, false,
+									index->rd_rel->relpersistence,
+									MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									NULL);
 
 	return buffer;
 }
diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index a6d9f09f315..f700e7c9f0b 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -353,7 +353,6 @@ Buffer
 BloomNewBuffer(Relation index)
 {
 	Buffer		buffer;
-	bool		needLock;
 
 	/* First, try to get a page from FSM */
 	for (;;)
@@ -387,15 +386,10 @@ BloomNewBuffer(Relation index)
 	}
 
 	/* Must extend the file */
-	needLock = !RELATION_IS_LOCAL(index);
-	if (needLock)
-		LockRelationForExtension(index, ExclusiveLock);
-
-	buffer = ReadBuffer(index, P_NEW);
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
-	if (needLock)
-		UnlockRelationForExtension(index, ExclusiveLock);
+	buffer = ExtendRelationBuffered(index, NULL, false,
+									index->rd_rel->relpersistence,
+									MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
+									NULL);
 
 	return buffer;
 }
-- 
2.38.0

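The conversion in this patch follows the same pattern everywhere; as a compact
before/after sketch (the concrete call sites are in the hunks above):

    /* before: extension lock managed by the caller */
    if (!RELATION_IS_LOCAL(rel))
        LockRelationForExtension(rel, ExclusiveLock);
    buffer = ReadBuffer(rel, P_NEW);
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    if (!RELATION_IS_LOCAL(rel))
        UnlockRelationForExtension(rel, ExclusiveLock);

    /* after: extension locking handled inside bufmgr.c */
    buffer = ExtendRelationBuffered(rel, NULL, false,
                                    rel->rd_rel->relpersistence,
                                    MAIN_FORKNUM, RBM_ZERO_AND_LOCK,
                                    NULL);
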
>From ede000ac82133434ebe293506d5d434f6f360904 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 23 Oct 2022 14:44:43 -0700
Subject: [PATCH v2 09/14] heapam: Add num_pages to RelationGetBufferForTuple()

This will be useful to compute the number of pages to extend a relation by.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/access/hio.h         | 14 +++++++-
 src/backend/access/heap/heapam.c | 60 +++++++++++++++++++++++++++++---
 src/backend/access/heap/hio.c    |  8 ++++-
 3 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/src/include/access/hio.h b/src/include/access/hio.h
index 3f20b585326..dd61462d988 100644
--- a/src/include/access/hio.h
+++ b/src/include/access/hio.h
@@ -30,6 +30,17 @@ typedef struct BulkInsertStateData
 {
 	BufferAccessStrategy strategy;	/* our BULKWRITE strategy object */
 	Buffer		current_buf;	/* current insertion target page */
+
+	/*
+	 * State for bulk extensions. Further pages that were unused at the time
+	 * of the extension. They might be in use by the time we use them though,
+	 * so rechecks are needed.
+	 *
+	 * FIXME: Perhaps these should live in RelationData instead, alongside the
+	 * targetblock?
+	 */
+	BlockNumber	next_free;
+	BlockNumber	last_free;
 } BulkInsertStateData;
 
 
@@ -38,6 +49,7 @@ extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
 extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
 										Buffer otherBuffer, int options,
 										BulkInsertStateData *bistate,
-										Buffer *vmbuffer, Buffer *vmbuffer_other);
+										Buffer *vmbuffer, Buffer *vmbuffer_other,
+										int num_pages);
 
 #endif							/* HIO_H */
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 63c4f01f0fd..e73a750892c 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1979,6 +1979,8 @@ GetBulkInsertState(void)
 	bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
 	bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
 	bistate->current_buf = InvalidBuffer;
+	bistate->next_free = InvalidBlockNumber;
+	bistate->last_free = InvalidBlockNumber;
 	return bistate;
 }
 
@@ -2052,7 +2054,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	 */
 	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 									   InvalidBuffer, options, bistate,
-									   &vmbuffer, NULL);
+									   &vmbuffer, NULL,
+									   0);
 
 	/*
 	 * We're about to do the actual insert -- but check for conflict first, to
@@ -2255,6 +2258,33 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
 		return tup;
 }
 
+/*
+ * Helper for heap_multi_insert() that computes the number of pages needed for the remaining tuples
+ */
+static int
+heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
+{
+	size_t		page_avail;
+	int			npages = 0;
+
+	page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
+	npages++;
+
+	for (int i = done; i < ntuples; i++)
+	{
+		size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
+
+		if (page_avail < tup_sz)
+		{
+			npages++;
+			page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
+		}
+		page_avail -= tup_sz;
+	}
+
+	return npages;
+}
+
 /*
  *	heap_multi_insert	- insert multiple tuples into a heap
  *
@@ -2281,6 +2311,9 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 	Size		saveFreeSpace;
 	bool		need_tuple_data = RelationIsLogicallyLogged(relation);
 	bool		need_cids = RelationIsAccessibleInLogicalDecoding(relation);
+	bool		starting_with_empty_page = false;
+	int			npages = 0;
+	int			npages_used = 0;
 
 	/* currently not needed (thus unsupported) for heap_multi_insert() */
 	Assert(!(options & HEAP_INSERT_NO_LOGICAL));
@@ -2331,13 +2364,30 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 	while (ndone < ntuples)
 	{
 		Buffer		buffer;
-		bool		starting_with_empty_page;
 		bool		all_visible_cleared = false;
 		bool		all_frozen_set = false;
 		int			nthispage;
 
 		CHECK_FOR_INTERRUPTS();
 
+		/*
+		 * Compute number of pages needed to insert tuples in the worst
+		 * case. This will be used to determine how much to extend the
+		 * relation by in RelationGetBufferForTuple(), if needed.  If we
+		 * filled a prior page from scratch, we can just update our last
+		 * computation, but if we started with a partially filled page,
+		 * recompute from scratch, as the number of potentially required pages
+		 * can vary due to tuples needing to fit onto the page, page headers
+		 * etc.
+		 */
+		if (ndone == 0 || !starting_with_empty_page)
+		{
+			npages = heap_multi_insert_pages(heaptuples, ndone, ntuples, saveFreeSpace);
+			npages_used = 0;
+		}
+		else
+			npages_used++;
+
 		/*
 		 * Find buffer where at least the next tuple will fit.  If the page is
 		 * all-visible, this will also pin the requisite visibility map page.
@@ -2347,7 +2397,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 		 */
 		buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
 										   InvalidBuffer, options, bistate,
-										   &vmbuffer, NULL);
+										   &vmbuffer, NULL,
+										   npages - npages_used);
 		page = BufferGetPage(buffer);
 
 		starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
@@ -3770,7 +3821,8 @@ l2:
 				/* It doesn't fit, must use RelationGetBufferForTuple. */
 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
 												   buffer, 0, NULL,
-												   &vmbuffer_new, &vmbuffer);
+												   &vmbuffer_new, &vmbuffer,
+												   0);
 				/* We're all done. */
 				break;
 			}
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 7479212d4e0..65886839e70 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -275,6 +275,11 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
  *	Returns pinned and exclusive-locked buffer of a page in given relation
  *	with free space >= given len.
  *
+ *	If num_pages is > 1, the relation will be extended by at least that many
+ *	pages when we decide to extend the relation. This is more efficient for
+ *	callers that know they will need multiple pages
+ *	(e.g. heap_multi_insert()).
+ *
  *	If otherBuffer is not InvalidBuffer, then it references a previously
  *	pinned buffer of another page in the same relation; on return, this
  *	buffer will also be exclusive-locked.  (This case is used by heap_update;
@@ -333,7 +338,8 @@ Buffer
 RelationGetBufferForTuple(Relation relation, Size len,
 						  Buffer otherBuffer, int options,
 						  BulkInsertState bistate,
-						  Buffer *vmbuffer, Buffer *vmbuffer_other)
+						  Buffer *vmbuffer, Buffer *vmbuffer_other,
+						  int num_pages)
 {
 	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
 	Buffer		buffer = InvalidBuffer;
-- 
2.38.0
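
For illustration, here is a minimal standalone sketch (not part of the patch)
of the worst-case page computation that heap_multi_insert_pages() above
performs; the constants and tuple lengths are made-up stand-ins for the real
PostgreSQL definitions.

#include <stdio.h>
#include <stddef.h>

#define BLCKSZ 8192
#define PAGE_HEADER_SZ 24			/* stand-in for SizeOfPageHeaderData */
#define ITEM_ID_SZ 4				/* stand-in for sizeof(ItemIdData) */
#define MAXALIGN(x) (((x) + 7) & ~(size_t) 7)

/* Worst-case number of pages needed for tuple_lens[done .. ntuples-1]. */
static int
worst_case_pages(const size_t *tuple_lens, int done, int ntuples,
				 size_t save_free_space)
{
	size_t		page_avail = BLCKSZ - PAGE_HEADER_SZ - save_free_space;
	int			npages = 1;

	for (int i = done; i < ntuples; i++)
	{
		size_t		tup_sz = ITEM_ID_SZ + MAXALIGN(tuple_lens[i]);

		if (page_avail < tup_sz)
		{
			/* start a fresh page; assumes every tuple fits on an empty page */
			npages++;
			page_avail = BLCKSZ - PAGE_HEADER_SZ - save_free_space;
		}
		page_avail -= tup_sz;
	}

	return npages;
}

int
main(void)
{
	size_t		lens[] = {100, 2000, 2000, 2000, 2000, 4000};

	printf("worst-case pages: %d\n", worst_case_pages(lens, 0, 6, 0));
	return 0;
}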

>From 57b2af7bd09b8baa5e0d0b0cca07816902b9e759 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 26 Oct 2022 14:14:11 -0700
Subject: [PATCH v2 10/14] hio: Use BulkExtendRelationBuffered()

---
 src/backend/access/heap/hio.c | 181 +++++++++++++++++++++++++++++++---
 1 file changed, 170 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 65886839e70..f40439d2778 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -24,6 +24,7 @@
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
 
+#define NEW_EXTEND
 
 /*
  * RelationPutHeapTuple - place tuple at specified page
@@ -185,6 +186,8 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 	}
 }
 
+#ifndef NEW_EXTEND
+
 /*
  * Extend a relation by multiple blocks to avoid future contention on the
  * relation extension lock.  Our goal is to pre-extend the relation by an
@@ -268,6 +271,7 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
 	 */
 	FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1);
 }
+#endif
 
 /*
  * RelationGetBufferForTuple
@@ -354,6 +358,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
 
 	len = MAXALIGN(len);		/* be conservative */
 
+	if (num_pages <= 0)
+		num_pages = 1;
+
 	/* Bulk insert is not supported for updates, only inserts. */
 	Assert(otherBuffer == InvalidBuffer || !bistate);
 
@@ -558,18 +565,46 @@ loop:
 			ReleaseBuffer(buffer);
 		}
 
-		/* Without FSM, always fall out of the loop and extend */
-		if (!use_fsm)
-			break;
+		if (bistate
+			&& bistate->next_free != InvalidBlockNumber
+			&& bistate->next_free <= bistate->last_free)
+		{
+			/*
+			 * We bulk extended the relation before, and there are still some
+			 * unused pages from that extension, so we don't need to look in
+			 * the FSM for a new page. But do record the free space from the
+			 * last page, as somebody might insert narrower tuples later.
+			 */
+			if (use_fsm)
+				RecordPageWithFreeSpace(relation, targetBlock, pageFreeSpace);
 
-		/*
-		 * Update FSM as to condition of this page, and ask for another page
-		 * to try.
-		 */
-		targetBlock = RecordAndGetPageWithFreeSpace(relation,
-													targetBlock,
-													pageFreeSpace,
-													targetFreeSpace);
+			Assert(bistate->last_free != InvalidBlockNumber &&
+				   bistate->next_free <= bistate->last_free);
+			targetBlock = bistate->next_free;
+			if (bistate->next_free >= bistate->last_free)
+			{
+				bistate->next_free = InvalidBlockNumber;
+				bistate->last_free = InvalidBlockNumber;
+			}
+			else
+				bistate->next_free++;
+		}
+		else if (!use_fsm)
+		{
+			/* Without FSM, always fall out of the loop and extend */
+			break;
+		}
+		else
+		{
+			/*
+			 * Update FSM as to condition of this page, and ask for another page
+			 * to try.
+			 */
+			targetBlock = RecordAndGetPageWithFreeSpace(relation,
+														targetBlock,
+														pageFreeSpace,
+														targetFreeSpace);
+		}
 	}
 
 	/*
@@ -582,6 +617,129 @@ loop:
 	 */
 	needLock = !RELATION_IS_LOCAL(relation);
 
+#ifdef NEW_EXTEND
+	{
+#define MAX_BUFFERS 64
+		Buffer victim_buffers[MAX_BUFFERS];
+		BlockNumber firstBlock = InvalidBlockNumber;
+		BlockNumber firstBlockFSM = InvalidBlockNumber;
+		BlockNumber curBlock;
+		uint32 extend_by_pages;
+		uint32 no_fsm_pages;
+		uint32 waitcount;
+
+		extend_by_pages = num_pages;
+
+		/*
+		 * Multiply the number of pages to extend by the number of waiters. Do
+		 * this even if we're not using the FSM, as it does relieve
+		 * contention. Pages will be found via bistate->next_free.
+		 */
+		if (needLock)
+			waitcount = RelationExtensionLockWaiterCount(relation);
+		else
+			waitcount = 0;
+		extend_by_pages += extend_by_pages * waitcount;
+
+		/*
+		 * Can't extend by more than MAX_BUFFERS, as we need to pin them all
+		 * concurrently. FIXME: Need an NBuffers / MaxBackends type limit
+		 * here.
+		 */
+		extend_by_pages = Min(extend_by_pages, MAX_BUFFERS);
+
+		/*
+		 * How many of the extended pages not to enter into the FSM.
+		 *
+		 * Only enter pages that we don't need ourselves into the
+		 * FSM. Otherwise every other backend will immediately try to use the
+		 * pages this backend needs itself, causing unnecessary contention.
+		 *
+		 * Bulk extended pages are remembered in bistate->next_free. So
+		 * without a bistate we can't directly make use of them.
+		 *
+		 * Never enter the page returned into the FSM, as we'll immediately use
+		 * it.
+		 */
+		if (num_pages > 1 && bistate == NULL)
+			no_fsm_pages = 1;
+		else
+			no_fsm_pages = num_pages;
+
+		if (bistate && bistate->current_buf != InvalidBuffer)
+		{
+			ReleaseBuffer(bistate->current_buf);
+			bistate->current_buf = InvalidBuffer;
+		}
+
+		firstBlock = BulkExtendRelationBuffered(relation,
+												NULL,
+												false,
+												relation->rd_rel->relpersistence,
+												MAIN_FORKNUM,
+												RBM_ZERO_AND_LOCK,
+												bistate ? bistate->strategy : NULL,
+												&extend_by_pages,
+												1,
+												victim_buffers);
+		/*
+		 * Relation is now extended. Make all but the first buffer available
+		 * to other backends.
+		 *
+		 * XXX: We don't necessarily need to release pin / update FSM while
+		 * holding the extension lock. But there are some advantages.
+		 */
+		curBlock = firstBlock;
+		for (uint32 i = 0; i < extend_by_pages; i++, curBlock++)
+		{
+			Assert(curBlock == BufferGetBlockNumber(victim_buffers[i]));
+			Assert(BlockNumberIsValid(curBlock));
+
+			/* don't release the pin on the page returned by this function */
+			if (i > 0)
+				ReleaseBuffer(victim_buffers[i]);
+
+			if (i >= no_fsm_pages && use_fsm)
+			{
+				if (firstBlockFSM == InvalidBlockNumber)
+					firstBlockFSM = curBlock;
+
+				RecordPageWithFreeSpace(relation,
+										curBlock,
+										BufferGetPageSize(victim_buffers[i]) - SizeOfPageHeaderData);
+			}
+		}
+
+		if (use_fsm && firstBlockFSM != InvalidBlockNumber)
+			FreeSpaceMapVacuumRange(relation, firstBlockFSM, firstBlock + num_pages);
+
+		if (bistate)
+		{
+			if (extend_by_pages > 1)
+			{
+				bistate->next_free = firstBlock + 1;
+				bistate->last_free = firstBlock + extend_by_pages - 1;
+			}
+			else
+			{
+				bistate->next_free = InvalidBlockNumber;
+				bistate->last_free = InvalidBlockNumber;
+			}
+		}
+
+		buffer = victim_buffers[0];
+		if (bistate)
+		{
+			IncrBufferRefCount(buffer);
+			bistate->current_buf = buffer;
+		}
+#if 0
+		ereport(LOG, errmsg("block start %u, size %zu, requested pages: %u, extend_by_pages: %d, waitcount: %d",
+							firstBlock, len, num_pages, extend_by_pages, waitcount),
+				errhidestmt(true), errhidecontext(true));
+#endif
+	}
+#else
 	/*
 	 * If we need the lock but are not able to acquire it immediately, we'll
 	 * consider extending the relation by multiple blocks at a time to manage
@@ -635,6 +793,7 @@ loop:
 	 */
 	if (needLock)
 		UnlockRelationForExtension(relation, ExclusiveLock);
+#endif
 
 	/*
 	 * We need to initialize the empty new page.  Double-check that it really
-- 
2.38.0
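
To make the new bistate->next_free / last_free bookkeeping easier to follow,
here is a small standalone sketch (not part of the patch) of how leftover
pages from a bulk extension are handed out one by one before falling back to
the FSM; the type and function names are simplified stand-ins.

#include <stdio.h>

typedef unsigned int BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

typedef struct
{
	BlockNumber next_free;		/* next unused page from the last extension */
	BlockNumber last_free;		/* last unused page from the last extension */
} BulkState;

/* Remember the pages left over after extending by extend_by pages. */
static void
remember_extension(BulkState *bs, BlockNumber first_block, unsigned extend_by)
{
	if (extend_by > 1)
	{
		bs->next_free = first_block + 1;
		bs->last_free = first_block + extend_by - 1;
	}
	else
	{
		bs->next_free = InvalidBlockNumber;
		bs->last_free = InvalidBlockNumber;
	}
}

/* Return the next remembered page, or InvalidBlockNumber if none are left. */
static BlockNumber
take_remembered_page(BulkState *bs)
{
	BlockNumber result;

	if (bs->next_free == InvalidBlockNumber || bs->next_free > bs->last_free)
		return InvalidBlockNumber;

	result = bs->next_free;
	if (bs->next_free >= bs->last_free)
	{
		bs->next_free = InvalidBlockNumber;
		bs->last_free = InvalidBlockNumber;
	}
	else
		bs->next_free++;

	return result;
}

int
main(void)
{
	BulkState	bs = {InvalidBlockNumber, InvalidBlockNumber};
	BlockNumber blk;

	/* e.g. blocks 100..103 were just extended, block 100 is used directly */
	remember_extension(&bs, 100, 4);

	while ((blk = take_remembered_page(&bs)) != InvalidBlockNumber)
		printf("using remembered block %u\n", blk);	/* prints 101, 102, 103 */

	return 0;
}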

>From 8b8a853c378c8f409f8d173a54d1da0ba4d047d6 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 23 Oct 2022 14:41:46 -0700
Subject: [PATCH v2 11/14] bufmgr: debug: Add PrintBuffer[Desc]

Useful for development. Perhaps we should polish these and keep them?
---
 src/include/storage/buf_internals.h |  3 ++
 src/backend/storage/buffer/bufmgr.c | 49 +++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 57800254d2d..b651838b61a 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -390,6 +390,9 @@ extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
+extern void PrintBuffer(Buffer buffer, const char *msg);
+extern void PrintBufferDesc(BufferDesc *buf_hdr, const char *msg);
+
 /* freelist.c */
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 									 uint32 *buf_state);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 361ebc3ae26..d3134cecf2d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3723,6 +3723,55 @@ DropDatabaseBuffers(Oid dbid)
  *		use only.
  * -----------------------------------------------------------------
  */
+
+#include "utils/memutils.h"
+
+void
+PrintBufferDesc(BufferDesc *buf_hdr, const char *msg)
+{
+	Buffer		buffer = BufferDescriptorGetBuffer(buf_hdr);
+	uint32		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+	char	   *path = "";
+	BlockNumber blockno = InvalidBlockNumber;
+
+	CurrentMemoryContext->allowInCritSection = true;
+	if (buf_state & BM_TAG_VALID)
+	{
+		path = relpathbackend(BufTagGetRelFileLocator(&buf_hdr->tag),
+							  InvalidBackendId, BufTagGetForkNum(&buf_hdr->tag));
+		blockno = buf_hdr->tag.blockNum;
+	}
+
+	fprintf(stderr, "%d: [%u] msg: %s, rel: %s, block %u: refcount: %u / %u, usagecount: %u, flags:%s%s%s%s%s%s%s%s%s%s\n",
+			MyProcPid,
+			buffer,
+			msg,
+			path,
+			blockno,
+			BUF_STATE_GET_REFCOUNT(buf_state),
+			GetPrivateRefCount(buffer),
+			BUF_STATE_GET_USAGECOUNT(buf_state),
+			buf_state & BM_LOCKED ? " BM_LOCKED" : "",
+			buf_state & BM_DIRTY ? " BM_DIRTY" : "",
+			buf_state & BM_VALID ? " BM_VALID" : "",
+			buf_state & BM_TAG_VALID ? " BM_TAG_VALID" : "",
+			buf_state & BM_IO_IN_PROGRESS ? " BM_IO_IN_PROGRESS" : "",
+			buf_state & BM_IO_ERROR ? " BM_IO_ERROR" : "",
+			buf_state & BM_JUST_DIRTIED ? " BM_JUST_DIRTIED" : "",
+			buf_state & BM_PIN_COUNT_WAITER ? " BM_PIN_COUNT_WAITER" : "",
+			buf_state & BM_CHECKPOINT_NEEDED ? " BM_CHECKPOINT_NEEDED" : "",
+			buf_state & BM_PERMANENT ? " BM_PERMANENT" : ""
+		);
+}
+
+void
+PrintBuffer(Buffer buffer, const char *msg)
+{
+	BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
+
+	PrintBufferDesc(buf_hdr, msg);
+}
+
 #ifdef NOT_USED
 void
 PrintBufferDescs(void)
-- 
2.38.0
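
As a hypothetical usage sketch (not part of the patch): with these helpers
compiled into the backend, a suspect code path could be instrumented roughly
as below to trace pin counts. ReadBuffer() and ReleaseBuffer() are existing
bufmgr functions; the wrapper function itself is made up for illustration.

#include "postgres.h"

#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
debug_trace_one_block(Relation relation, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(relation, blkno);

	/* dump refcounts, usage count and flags to stderr */
	PrintBuffer(buf, "after ReadBuffer");

	/* ... the code path under investigation would run here ... */

	ReleaseBuffer(buf);
	PrintBuffer(buf, "after ReleaseBuffer");
}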
