Hi,

On 2023-03-28 18:21:02 -0700, Andres Freund wrote:
> Here's a draft patch.

Attached is v2, with a stupid bug fixed and a bit of comment / pgindent
polish.

Greetings,

Andres Freund
>From 6f27018765f167a8d2a52bc7c9068cb09ca8d82c Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 24 Oct 2022 12:28:06 -0700
Subject: [PATCH v2 1/2] hio: Release extension lock before initializing page /
 pinning VM

PageInit() while holding the extension lock is unnecessary after 0d1fe9f74e3
started to use RBM_ZERO_AND_LOCK - nobody can look at the new page before we
release the page lock. PageInit() zeroes the page, which isn't that cheap, so
deferring it until after the extension lock is released seems like a good idea.

Doing visibilitymap_pin() while holding the extension lock, introduced in
7db0cd2145f2, looks like an accident. Due to the restrictions on
HEAP_INSERT_FROZEN it's unlikely to be a performance issue, but it still seems
better to move it out.
---
 src/backend/access/heap/hio.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index e152807d2dc..7479212d4e0 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -623,6 +623,13 @@ loop:
 	 */
 	buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
 
+	/*
+	 * Release the file-extension lock; it's now OK for someone else to extend
+	 * the relation some more.
+	 */
+	if (needLock)
+		UnlockRelationForExtension(relation, ExclusiveLock);
+
 	/*
 	 * We need to initialize the empty new page.  Double-check that it really
 	 * is empty (this should never happen, but if it does we don't want to
@@ -647,13 +654,6 @@ loop:
 		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
 	}
 
-	/*
-	 * Release the file-extension lock; it's now OK for someone else to extend
-	 * the relation some more.
-	 */
-	if (needLock)
-		UnlockRelationForExtension(relation, ExclusiveLock);
-
 	/*
 	 * Lock the other buffer. It's guaranteed to be of a lower page number
 	 * than the new page. To conform with the deadlock prevent rules, we ought
-- 
2.38.0

>From 5a33df025ce466343d88bbc57ac1bc80d883a230 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 28 Mar 2023 18:17:11 -0700
Subject: [PATCH v2 2/2] WIP: relation extension: Don't pin the VM while
 holding buffer lock

---
 src/backend/access/heap/hio.c | 120 +++++++++++++++++++++++-----------
 1 file changed, 81 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 7479212d4e0..8b3dfa0ccae 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -134,14 +134,17 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
  * buffer2 may be InvalidBuffer, if only one buffer is involved.  buffer1
  * must not be InvalidBuffer.  If both buffers are specified, block1 must
  * be less than block2.
+ *
+ * Returns whether buffer locks were temporarily released.
  */
-static void
+static bool
 GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 					 BlockNumber block1, BlockNumber block2,
 					 Buffer *vmbuffer1, Buffer *vmbuffer2)
 {
 	bool		need_to_pin_buffer1;
 	bool		need_to_pin_buffer2;
+	bool		released_locks = false;
 
 	Assert(BufferIsValid(buffer1));
 	Assert(buffer2 == InvalidBuffer || block1 <= block2);
@@ -155,9 +158,10 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 			&& PageIsAllVisible(BufferGetPage(buffer2))
 			&& !visibilitymap_pin_ok(block2, *vmbuffer2);
 		if (!need_to_pin_buffer1 && !need_to_pin_buffer2)
-			return;
+			break;
 
 		/* We must unlock both buffers before doing any I/O. */
+		released_locks = true;
 		LockBuffer(buffer1, BUFFER_LOCK_UNLOCK);
 		if (buffer2 != InvalidBuffer && buffer2 != buffer1)
 			LockBuffer(buffer2, BUFFER_LOCK_UNLOCK);
@@ -183,6 +187,8 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 			|| (need_to_pin_buffer1 && need_to_pin_buffer2))
 			break;
 	}
+
+	return released_locks;
 }
 
 /*
@@ -345,6 +351,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	BlockNumber targetBlock,
 				otherBlock;
 	bool		needLock;
+	bool		unlockedTargetBuffer;
 
 	len = MAXALIGN(len);		/* be conservative */
 
@@ -630,6 +637,9 @@ loop:
 	if (needLock)
 		UnlockRelationForExtension(relation, ExclusiveLock);
 
+	unlockedTargetBuffer = false;
+	targetBlock = BufferGetBlockNumber(buffer);
+
 	/*
 	 * We need to initialize the empty new page.  Double-check that it really
 	 * is empty (this should never happen, but if it does we don't want to
@@ -639,75 +649,107 @@ loop:
 
 	if (!PageIsNew(page))
 		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-			 BufferGetBlockNumber(buffer),
+			 targetBlock,
 			 RelationGetRelationName(relation));
 
 	PageInit(page, BufferGetPageSize(buffer), 0);
 	MarkBufferDirty(buffer);
 
 	/*
-	 * The page is empty, pin vmbuffer to set all_frozen bit.
+	 * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to
+	 * do IO while the buffer is locked, so we unlock the page first if IO is
+	 * needed (necessitating checks below).
 	 */
 	if (options & HEAP_INSERT_FROZEN)
 	{
-		Assert(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0);
-		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
+		Assert(PageGetMaxOffsetNumber(page) == 0);
+
+		if (!visibilitymap_pin_ok(targetBlock, *vmbuffer))
+		{
+			unlockedTargetBuffer = true;
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+			visibilitymap_pin(relation, targetBlock, vmbuffer);
+		}
 	}
 
 	/*
-	 * Lock the other buffer. It's guaranteed to be of a lower page number
-	 * than the new page. To conform with the deadlock prevent rules, we ought
-	 * to lock otherBuffer first, but that would give other backends a chance
-	 * to put tuples on our page. To reduce the likelihood of that, attempt to
-	 * lock the other buffer conditionally, that's very likely to work.
-	 * Otherwise we need to lock buffers in the correct order, and retry if
-	 * the space has been used in the mean time.
+	 * If we unlocked the target buffer above, it's unlikely, but possible,
+	 * that another backend used space on this page.
+	 *
+	 * If we didn't, and otherBuffer is valid, we need to lock the other
+	 * buffer. It's guaranteed to be of a lower page number than the new page.
+	 * To conform with the deadlock prevent rules, we ought to lock
+	 * otherBuffer first, but that would give other backends a chance to put
+	 * tuples on our page. To reduce the likelihood of that, attempt to lock
+	 * the other buffer conditionally, that's very likely to work.  Otherwise
+	 * we need to lock buffers in the correct order, and retry if the space
+	 * has been used in the mean time.
 	 *
 	 * Alternatively, we could acquire the lock on otherBuffer before
 	 * extending the relation, but that'd require holding the lock while
 	 * performing IO, which seems worse than an unlikely retry.
 	 */
-	if (otherBuffer != InvalidBuffer)
+	if (unlockedTargetBuffer)
+	{
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	}
+	else if (otherBuffer != InvalidBuffer)
 	{
 		Assert(otherBuffer != buffer);
-		targetBlock = BufferGetBlockNumber(buffer);
 		Assert(targetBlock > otherBlock);
 
 		if (unlikely(!ConditionalLockBuffer(otherBuffer)))
 		{
+			unlockedTargetBuffer = true;
 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		}
+	}
 
-		/*
-		 * Because the buffers were unlocked for a while, it's possible,
-		 * although unlikely, that an all-visible flag became set or that
-		 * somebody used up the available space in the new page.  We can use
-		 * GetVisibilityMapPins to deal with the first case.  In the second
-		 * case, just retry from start.
-		 */
-		GetVisibilityMapPins(relation, otherBuffer, buffer,
-							 otherBlock, targetBlock, vmbuffer_other,
-							 vmbuffer);
-
-		/*
-		 * Note that we have to check the available space even if our
-		 * conditional lock succeeded, because GetVisibilityMapPins might've
-		 * transiently released lock on the target buffer to acquire a VM pin
-		 * for the otherBuffer.
-		 */
-		if (len > PageGetHeapFreeSpace(page))
+	/*
+	 * If one of the buffers was unlocked, it's possible, although unlikely,
+	 * that an all-visible flag became set.  We can use GetVisibilityMapPins
+	 * to deal with that.
+	 */
+	if (unlockedTargetBuffer || otherBuffer != InvalidBuffer)
+	{
+		if (otherBuffer != InvalidBuffer)
 		{
-			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+			if (GetVisibilityMapPins(relation, otherBuffer, buffer,
+									 otherBlock, targetBlock, vmbuffer_other,
+									 vmbuffer))
+				unlockedTargetBuffer = true;
+		}
+		else
+		{
+			if (GetVisibilityMapPins(relation, buffer, InvalidBuffer,
+									 targetBlock, InvalidBlockNumber,
+									 vmbuffer, InvalidBuffer))
+				unlockedTargetBuffer = true;
+		}
+	}
+
+	/*
+	 * If the target buffer was temporarily unlocked since the relation
+	 * extension, it's possible, although unlikely, that all the space on the
+	 * page was already used. If so, we just retry from the start.  If we
+	 * didn't unlock, something has gone wrong if there's not enough space -
+	 * the test at the top should have prevented reaching this case.
+	 */
+	pageFreeSpace = PageGetHeapFreeSpace(page);
+	if (len > pageFreeSpace)
+	{
+		if (unlockedTargetBuffer)
+		{
+			if (otherBuffer != InvalidBuffer)
+				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
 			UnlockReleaseBuffer(buffer);
 
 			goto loop;
 		}
-	}
-	else if (len > PageGetHeapFreeSpace(page))
-	{
-		/* We should not get here given the test at the top */
 		elog(PANIC, "tuple is too big: size %zu", len);
 	}
 
@@ -720,7 +762,7 @@ loop:
 	 * current backend to make more insertions or not, which is probably a
 	 * good bet most of the time.  So for now, don't add it to FSM yet.
 	 */
-	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
+	RelationSetTargetBlock(relation, targetBlock);
 
 	return buffer;
 }
-- 
2.38.0

Reply via email to