Hello, Matthias!

Updated patches attached.

Changes:
* cleanup test logic a little bit
* resolve issue with rescan in GIST (item->blkno == InvalidBlockNumber)
* move test to the main isolation suite
* add test for SpGist
* update comment I mentioned before
* allow GIST to set LP_DEAD in cases where it is safe even if the LSN was updated

Also, it seems like the SP-GiST version is broken; it fails like this:

TRAP: failed Assert("BufferIsValid(so->pagePin)"), File:
"../src/backend/access/spgist/spgscan.c", Line: 1139, PID: 612214
FETCH(ExceptionalCondition+0xbe)[0x644a0b9dfdbc]
FETCH(spggettuple+0x289)[0x644a0b3743c6]
FETCH(index_getnext_tid+0x166)[0x644a0b3382f7]
FETCH(+0x3b392b)[0x644a0b56d92b]
FETCH(+0x3887df)[0x644a0b5427df]
FETCH(ExecScan+0x77)[0x644a0b542858]
FETCH(+0x3b3b9b)[0x644a0b56db9b]
FETCH(+0x376d8b)[0x644a0b530d8b]
FETCH(+0x379bd9)[0x644a0b533bd9]
FETCH(standard_ExecutorRun+0x19f)[0x644a0b531457]
FETCH(ExecutorRun+0x5a)[0x644a0b5312b5]
FETCH(+0x6276dc)[0x644a0b7e16dc]
FETCH(+0x628936)[0x644a0b7e2936]
FETCH(PortalRunFetch+0x1a0)[0x644a0b7e229c]
FETCH(PerformPortalFetch+0x13b)[0x644a0b49d7e5]
FETCH(standard_ProcessUtility+0x5f0)[0x644a0b7e3aab]
FETCH(ProcessUtility+0x140)[0x644a0b7e34b4]
FETCH(+0x627ceb)[0x644a0b7e1ceb]
FETCH(+0x627a28)[0x644a0b7e1a28]
FETCH(PortalRun+0x273)[0x644a0b7e12bb]
FETCH(+0x61fae1)[0x644a0b7d9ae1]
FETCH(PostgresMain+0x9eb)[0x644a0b7df170]
FETCH(+0x61b3e2)[0x644a0b7d53e2]
FETCH(postmaster_child_launch+0x137)[0x644a0b6e6e2d]
FETCH(+0x53384b)[0x644a0b6ed84b]
FETCH(+0x530f31)[0x644a0b6eaf31]
FETCH(PostmasterMain+0x161f)[0x644a0b6ea812]
FETCH(main+0x3a1)[0x644a0b5c29cf]

Best regards,
Mikhail.

>
From 59b7746c96cca144c1d1d0362a96d8aa019e2a23 Mon Sep 17 00:00:00 2001
From: nkey <n...@toloka.ai>
Date: Fri, 10 Jan 2025 17:55:30 +0100
Subject: [PATCH v5 2/3] RFC: Extend buffer pinning for GIST IOS

This should fix issues with incorrect results when a GIST IOS encounters tuples removed from pages by a concurrent vacuum operation.

Also, add the ability to set LP_DEAD bits in more cases of IOS scans over GIST.
---
 src/backend/access/gist/README       |  16 ++++
 src/backend/access/gist/gistget.c    |  46 +++++++++--
 src/backend/access/gist/gistscan.c   | 115 ++++++++++++++++++++++++---
 src/backend/access/gist/gistvacuum.c |   6 +-
 src/include/access/gist_private.h    |   2 +
 5 files changed, 166 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 8015ff19f05..c7c2afad088 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -287,6 +287,22 @@ would complicate the insertion algorithm. So when an insertion sees a page
 with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
 crashed in the middle to completion by adding the downlink in the parent.
 
+Index-only scans and VACUUM
+---------------------------
+
+Index-only scans require that any tuple returned by the index scan has not
+been removed from the index by a call to ambulkdelete through VACUUM.
+To ensure this invariant, bulkdelete now requires a buffer cleanup lock, and
+every index-only scan (IOS) will keep a pin on each page that it is returning
+tuples from.  For ordered scans, we keep one pin for each matching leaf tuple,
+for unordered scans we just keep an additional pin while we're still working
+on the page's tuples.  This ensures that pages seen by the scan won't be
+cleaned up until after the tuples have been returned.
+
+These longer pin lifetimes can cause buffer exhaustion with messages like "no
+unpinned buffers available" when the index has many pages that have similar
+ordering; future work can figure out how best to address this.
+
 Buffering build algorithm
 -------------------------
 
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index cc40e928e0a..adf86fed67b 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -35,7 +35,7 @@
  * away and the TID was re-used by a completely different heap tuple.
  */
 static void
-gistkillitems(IndexScanDesc scan)
+gistkillitems(IndexScanDesc scan, bool pagePinned)
 {
 	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 	Buffer		buffer;
@@ -60,9 +60,10 @@ gistkillitems(IndexScanDesc scan)
 	/*
 	 * If page LSN differs it means that the page was modified since the last
 	 * read. killedItems could be not valid so LP_DEAD hints applying is not
-	 * safe.
+	 * safe. But if the page was kept pinned, it is safe, because VACUUM is
+	 * unable to remove tuples due to locking protocol.
 	 */
-	if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+	if (!pagePinned && BufferGetLSNAtomic(buffer) != so->curPageLSN)
 	{
 		UnlockReleaseBuffer(buffer);
 		so->numKilled = 0;		/* reset counter */
@@ -395,6 +396,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	}
 
 	so->nPageData = so->curPageData = 0;
+	Assert(so->pagePin == InvalidBuffer);
 	scan->xs_hitup = NULL;		/* might point into pageDataCxt */
 	if (so->pageDataCxt)
 		MemoryContextReset(so->pageDataCxt);
@@ -402,7 +404,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	/*
 	 * We save the LSN of the page as we read it, so that we know whether it
 	 * safe to apply LP_DEAD hints to the page later. This allows us to drop
-	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
+	 * the pin for MVCC scans (except index-only scans), which allows vacuum
+	 * to avoid blocking.
 	 */
 	so->curPageLSN = BufferGetLSNAtomic(buffer);
 
@@ -460,6 +463,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 			so->pageData[so->nPageData].heapPtr = it->t_tid;
 			so->pageData[so->nPageData].recheck = recheck;
 			so->pageData[so->nPageData].offnum = i;
+			so->pageData[so->nPageData].buffer = InvalidBuffer;
 
 			/*
 			 * In an index-only scan, also fetch the data from the tuple.  The
@@ -471,7 +475,18 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				so->pageData[so->nPageData].recontup =
 					gistFetchTuple(giststate, r, it);
 				MemoryContextSwitchTo(oldcxt);
+
+				/*
+				 * Only maintain a single additional buffer pin for unordered
+				 * IOS scans, as we have all the data already in one place.
+				 */
+				if (so->nPageData == 0)
+				{
+					so->pagePin = buffer;
+					IncrBufferRefCount(buffer);
+				}
 			}
+
 			so->nPageData++;
 		}
 		else
@@ -501,7 +516,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 				 * In an index-only scan, also fetch the data from the tuple.
 				 */
 				if (scan->xs_want_itup)
+				{
 					item->data.heap.recontup = gistFetchTuple(giststate, r, it);
+					item->data.heap.buffer = buffer;
+					IncrBufferRefCount(buffer);
+				}
 			}
 			else
 			{
@@ -567,6 +586,10 @@ getNextNearest(IndexScanDesc scan)
 		/* free previously returned tuple */
 		pfree(scan->xs_hitup);
 		scan->xs_hitup = NULL;
+
+		Assert(BufferIsValid(so->pagePin));
+		ReleaseBuffer(so->pagePin);
+		so->pagePin = InvalidBuffer;
 	}
 
 	do
@@ -588,7 +611,11 @@ getNextNearest(IndexScanDesc scan)
 
 			/* in an index-only scan, also return the reconstructed tuple. */
 			if (scan->xs_want_itup)
+			{
+				Assert(BufferIsValid(item->data.heap.buffer));
 				scan->xs_hitup = item->data.heap.recontup;
+				so->pagePin = item->data.heap.buffer;
+			}
 			res = true;
 		}
 		else
@@ -688,7 +715,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 				&& so->curPageData > 0
 				&& so->curPageData == so->nPageData)
 			{
-
 				if (so->killedItems == NULL)
 				{
 					MemoryContext oldCxt =
@@ -704,13 +730,21 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 					so->killedItems[so->numKilled++] =
 						so->pageData[so->curPageData - 1].offnum;
 			}
+
+			if (scan->xs_want_itup && so->nPageData > 0)
+			{
+				Assert(BufferIsValid(so->pagePin));
+				ReleaseBuffer(so->pagePin);
+				so->pagePin = InvalidBuffer;
+			}
+
 			/* find and process the next index page */
 			do
 			{
 				GISTSearchItem *item;
 
 				if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0))
-					gistkillitems(scan);
+					gistkillitems(scan, BufferIsValid(so->pagePin));
 
 				item = getNextGISTSearchItem(so);
 
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 700fa959d03..932c2271510 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -110,6 +110,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys)
 	so->numKilled = 0;
 	so->curBlkno = InvalidBlockNumber;
 	so->curPageLSN = InvalidXLogRecPtr;
+	so->pagePin = InvalidBuffer;
 
 	scan->opaque = so;
 
@@ -151,18 +152,73 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 		Assert(so->queueCxt == so->giststate->scanCxt);
 		first_time = true;
 	}
-	else if (so->queueCxt == so->giststate->scanCxt)
-	{
-		/* second time through */
-		so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
-											 "GiST queue context",
-											 ALLOCSET_DEFAULT_SIZES);
-		first_time = false;
-	}
 	else
 	{
-		/* third or later time through */
-		MemoryContextReset(so->queueCxt);
+		/*
+		 * In the first scan of a query we allocate IOS items in the scan
+		 * context, which is never reset. To not leak this memory, we
+		 * manually free the queue entries.
+		 */
+		const bool freequeue = so->queueCxt == so->giststate->scanCxt;
+		/*
+		 * Index-only scans require that vacuum can't clean up entries that
+		 * we're still planning to return, so we hold a pin on the buffer until
+		 * we're past the returned item (1 pin count for every index tuple).
+		 * When rescan is called, however, we need to clean up the pins that
+		 * we still hold, lest we leak them and lose a buffer entry to that
+		 * page.
+		 */
+		const bool unpinqueue = scan->xs_want_itup;
+
+		if (freequeue || unpinqueue)
+		{
+			while (!pairingheap_is_empty(so->queue))
+			{
+				pairingheap_node *node;
+				GISTSearchItem *item;
+
+				node = pairingheap_remove_first(so->queue);
+				item = pairingheap_container(GISTSearchItem, phNode, node);
+
+				/*
+				 * If we need to unpin a buffer for IOS' heap items, do so
+				 * now.
+				 */
+				if (unpinqueue && item->blkno == InvalidBlockNumber)
+				{
+					Assert(BufferIsValid(item->data.heap.buffer));
+					ReleaseBuffer(item->data.heap.buffer);
+				}
+
+				/*
+				 * item->data.heap.recontup is stored in the separate memory
+				 * context so->pageDataCxt, which is always reset; so we don't
+				 * need to free that.
+				 * "item" itself is allocated into the queue context, which is
+				 * generally reset in rescan.
+				 * However, only in the first scan, we allocate these items
+				 * into the main scan context, which isn't reset; so we must
+				 * free these items, or else we'd leak the memory for the
+				 * duration of the query.
+				 */
+				if (freequeue)
+					pfree(item);
+			}
+		}
+
+		if (so->queueCxt == so->giststate->scanCxt)
+		{
+			/* second time through */
+			so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt,
+												 "GiST queue context",
+												 ALLOCSET_DEFAULT_SIZES);
+		}
+		else
+		{
+			/* third or later time through */
+			MemoryContextReset(so->queueCxt);
+		}
+
 		first_time = false;
 	}
 
@@ -341,6 +397,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys,
 
 	/* any previous xs_hitup will have been pfree'd in context resets above */
 	scan->xs_hitup = NULL;
+
+	if (scan->xs_want_itup)
+	{
+		if (BufferIsValid(so->pagePin))
+		{
+			ReleaseBuffer(so->pagePin);
+			so->pagePin = InvalidBuffer;
+		}
+	}
 }
 
 void
@@ -348,6 +413,36 @@ gistendscan(IndexScanDesc scan)
 {
 	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
 
+	if (scan->xs_want_itup)
+	{
+		if (BufferIsValid(so->pagePin))
+		{
+			ReleaseBuffer(so->pagePin);
+			so->pagePin = InvalidBuffer;
+		}
+
+		/* unpin any leftover buffers */
+		while (!pairingheap_is_empty(so->queue))
+		{
+			pairingheap_node *node;
+			GISTSearchItem *item;
+
+			/*
+			 * Note: unlike gistrescan, there is no need to actually free the
+			 * items here, as that's handled by memory context reset in the
+			 * call to freeGISTstate() below.
+			 */
+			node = pairingheap_remove_first(so->queue);
+			item = pairingheap_container(GISTSearchItem, phNode, node);
+
+			if (item->blkno == InvalidBlockNumber)
+			{
+				Assert(BufferIsValid(item->data.heap.buffer));
+				ReleaseBuffer(item->data.heap.buffer);
+			}
+		}
+	}
+
 	/*
 	 * freeGISTstate is enough to clean up everything made by gistbeginscan,
 	 * as well as the queueCxt if there is a separate context for it.
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index fe0bfb781ca..840b3d586ed 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -289,10 +289,10 @@ restart:
 								info->strategy);
 
 	/*
-	 * We are not going to stay here for a long time, aggressively grab an
-	 * exclusive lock.
+	 * We are not going to stay here for a long time, aggressively grab a
+	 * cleanup lock.
 	 */
-	LockBuffer(buffer, GIST_EXCLUSIVE);
+	LockBufferForCleanup(buffer);
 	page = (Page) BufferGetPage(buffer);
 
 	if (gistPageRecyclable(page))
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 39404ec7cdb..e559117e7d7 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -124,6 +124,7 @@ typedef struct GISTSearchHeapItem
 								 * index-only scans */
 	OffsetNumber offnum;		/* track offset in page to mark tuple as
 								 * LP_DEAD */
+	Buffer		buffer;			/* buffer to unpin, when IOS */
 } GISTSearchHeapItem;
 
 /* Unvisited item, either index page or heap tuple */
@@ -176,6 +177,7 @@ typedef struct GISTScanOpaqueData
 	OffsetNumber curPageData;	/* next item to return */
 	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
 								 * index-only scans */
+	Buffer		pagePin;		/* buffer of page, if pinned */
 } GISTScanOpaqueData;
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
-- 
2.43.0

From 5423affdba594ca0c1575f4f35c6ec479f82b216 Mon Sep 17 00:00:00 2001
From: nkey <n...@toloka.ai>
Date: Fri, 10 Jan 2025 16:22:29 +0100
Subject: [PATCH v5 1/3] isolation tester showing broken index-only scans with
 GiST and SP-GiST

Co-authored-by: Matthias van de Meent <boekewurm+postg...@gmail.com>
Co-authored-by: Michail Nikolaev <michail.nikol...@gmail.com>
---
 .../expected/index-only-scan-gist-vacuum.out  |  67 +++++++++++
 .../index-only-scan-spgist-vacuum.out         |  67 +++++++++++
 src/test/isolation/isolation_schedule         |   2 +
 .../specs/index-only-scan-gist-vacuum.spec    | 113 ++++++++++++++++++
 .../specs/index-only-scan-spgist-vacuum.spec  | 113 ++++++++++++++++++
 5 files changed, 362 insertions(+)
 create mode 100644 src/test/isolation/expected/index-only-scan-gist-vacuum.out
 create mode 100644 src/test/isolation/expected/index-only-scan-spgist-vacuum.out
 create mode 100644 src/test/isolation/specs/index-only-scan-gist-vacuum.spec
 create mode 100644 src/test/isolation/specs/index-only-scan-spgist-vacuum.spec

diff --git a/src/test/isolation/expected/index-only-scan-gist-vacuum.out b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
new file mode 100644
index 00000000000..19117402f52
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
@@ -0,0 +1,67 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod: 
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted: 
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1: 
+	FETCH FROM foo;
+
+                 x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all: 
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+            
+(1 row)
+
+x
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod: 
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted: 
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1: 
+	FETCH FROM foo;
+
+a    
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all: 
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+            
+(1 row)
+
+a
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/expected/index-only-scan-spgist-vacuum.out b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
new file mode 100644
index 00000000000..19117402f52
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
@@ -0,0 +1,67 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod: 
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted: 
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1: 
+	FETCH FROM foo;
+
+                 x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all: 
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+            
+(1 row)
+
+x
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod: 
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted: 
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1: 
+	FETCH FROM foo;
+
+a    
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock; <waiting ...>
+step s1_fetch_all: 
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+
+pg_sleep_for
+------------
+            
+(1 row)
+
+a
+-
+(0 rows)
+
+step s2_vacuum: <... completed>
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 143109aa4da..9720c9a2dc8 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -17,6 +17,8 @@ test: partial-index
 test: two-ids
 test: multiple-row-versions
 test: index-only-scan
+test: index-only-scan-gist-vacuum
+test: index-only-scan-spgist-vacuum
 test: predicate-lock-hot-tuple
 test: update-conflict-out
 test: deadlock-simple
diff --git a/src/test/isolation/specs/index-only-scan-gist-vacuum.spec b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
new file mode 100644
index 00000000000..b1688d44fa7
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
@@ -0,0 +1,113 @@
+# index-only-scan test showing wrong results with GiST
+#
+setup
+{
+	-- by using a low fillfactor and a wide tuple we can get multiple blocks
+	-- with just few rows
+	CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+	WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+	INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+	CREATE INDEX ios_gist_a ON ios_needs_cleanup_lock USING gist(a);
+}
+setup
+{
+	VACUUM ios_needs_cleanup_lock;
+}
+
+teardown
+{
+	DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+	SET enable_bitmapscan = false;
+	SET enable_indexonlyscan = true;
+	SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+	FETCH FROM foo;
+}
+
+step s1_fetch_all {
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1 so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum  { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+  # delete nearly all rows, to make issue visible
+  s2_mod
+  # create a cursor
+  s1_begin
+  s1_prepare_sorted
+
+  # fetch one row from the cursor, that ensures the index scan portion is done
+  # before the vacuum in the next step
+  s1_fetch_1
+
+  # with the bug this vacuum will mark pages as all-visible that the scan in
+  # the next step then considers all-visible, despite all rows from those
+  # pages having been removed.
+  # Because this should block on buffer-level locks, this won't ever be
+  # considered "blocked" by isolation tester, and so we only have a single
+  # step we can work with concurrently.
+  s2_vacuum (*)
+
+  # if this returns any rows, we're busted
+  s1_fetch_all
+
+  s1_commit
+
+permutation
+  # delete nearly all rows, to make issue visible
+  s2_mod
+  # create a cursor
+  s1_begin
+  s1_prepare_unsorted
+
+  # fetch one row from the cursor, that ensures the index scan portion is done
+  # before the vacuum in the next step
+  s1_fetch_1
+
+  # with the bug this vacuum will mark pages as all-visible that the scan in
+  # the next step then considers all-visible, despite all rows from those
+  # pages having been removed.
+  # Because this should block on buffer-level locks, this won't ever be
+  # considered "blocked" by isolation tester, and so we only have a single
+  # step we can work with concurrently.
+  s2_vacuum (*)
+
+  # if this returns any rows, we're busted
+  s1_fetch_all
+
+  s1_commit
diff --git a/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
new file mode 100644
index 00000000000..b414c5d1695
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
@@ -0,0 +1,113 @@
+# index-only-scan test showing wrong results with SPGiST
+#
+setup
+{
+	-- by using a low fillfactor and a wide tuple we can get multiple blocks
+	-- with just few rows
+	CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+	WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+	INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+	CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING spgist(a);
+}
+setup
+{
+	VACUUM ios_needs_cleanup_lock;
+}
+
+teardown
+{
+	DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+	SET enable_bitmapscan = false;
+	SET enable_indexonlyscan = true;
+	SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+	DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+	FETCH FROM foo;
+}
+
+step s1_fetch_all {
+	SELECT pg_sleep_for(INTERVAL '50ms');
+	FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1 so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+  DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum  { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+  # delete nearly all rows, to make issue visible
+  s2_mod
+  # create a cursor
+  s1_begin
+  s1_prepare_sorted
+
+  # fetch one row from the cursor, that ensures the index scan portion is done
+  # before the vacuum in the next step
+  s1_fetch_1
+
+  # with the bug this vacuum will mark pages as all-visible that the scan in
+  # the next step then considers all-visible, despite all rows from those
+  # pages having been removed.
+  # Because this should block on buffer-level locks, this won't ever be
+  # considered "blocked" by isolation tester, and so we only have a single
+  # step we can work with concurrently.
+  s2_vacuum (*)
+
+  # if this returns any rows, we're busted
+  s1_fetch_all
+
+  s1_commit
+
+permutation
+  # delete nearly all rows, to make issue visible
+  s2_mod
+  # create a cursor
+  s1_begin
+  s1_prepare_unsorted
+
+  # fetch one row from the cursor, that ensures the index scan portion is done
+  # before the vacuum in the next step
+  s1_fetch_1
+
+  # with the bug this vacuum will mark pages as all-visible that the scan in
+  # the next step then considers all-visible, despite all rows from those
+  # pages having been removed.
+  # Because this should block on buffer-level locks, this won't ever be
+  # considered "blocked" by isolation tester, and so we only have a single
+  # step we can work with concurrently.
+  s2_vacuum (*)
+
+  # if this returns any rows, we're busted
+  s1_fetch_all
+
+  s1_commit
-- 
2.43.0

From bf5e033d291a9dcfdd95ca95d576f5b40a8d34c2 Mon Sep 17 00:00:00 2001
From: nkey <n...@toloka.ai>
Date: Fri, 10 Jan 2025 18:00:49 +0100
Subject: [PATCH v5 3/3] RFC: Extend buffer pinning for SP-GIST IOS

This should fix issues with incorrect results when a SP-GIST
IOS encounters tuples removed from pages by a concurrent vacuum
operation.
---
 src/backend/access/spgist/spgscan.c   | 112 ++++++++++++++++++++++----
 src/backend/access/spgist/spgvacuum.c |   2 +-
 src/include/access/spgist_private.h   |   3 +
 3 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c
index 986362a777f..32e6a0a8a03 100644
--- a/src/backend/access/spgist/spgscan.c
+++ b/src/backend/access/spgist/spgscan.c
@@ -30,7 +30,8 @@
 typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr,
 							   Datum leafValue, bool isNull,
 							   SpGistLeafTuple leafTuple, bool recheck,
-							   bool recheckDistances, double *distances);
+							   bool recheckDistances, double *distances,
+							   Buffer pin);
 
 /*
  * Pairing heap comparison function for the SpGistSearchItem queue.
@@ -300,6 +301,38 @@ spgPrepareScanKeys(IndexScanDesc scan)
 	}
 }
 
+/*
+ * Note: This removes all items from the pairingheap.
+ */
+static void
+spgScanEndDropAllPagePins(IndexScanDesc scan, SpGistScanOpaque so)
+{
+	/* Guaranteed no pinned pages */
+	if (so->scanQueue == NULL || !scan->xs_want_itup)
+		return;
+
+	if (so->nPtrs > 0)
+	{
+		Assert(BufferIsValid(so->pagePin));
+		ReleaseBuffer(so->pagePin);
+		so->pagePin = InvalidBuffer;
+	}
+
+	while (!pairingheap_is_empty(so->scanQueue))
+	{
+		pairingheap_node *node;
+		SpGistSearchItem *item;
+
+		node = pairingheap_remove_first(so->scanQueue);
+		item = pairingheap_container(SpGistSearchItem, phNode, node);
+		if (!item->isLeaf)
+			continue;
+
+		Assert(BufferIsValid(item->buffer));
+		ReleaseBuffer(item->buffer);
+	}
+}
+
 IndexScanDesc
 spgbeginscan(Relation rel, int keysz, int orderbysz)
 {
@@ -416,6 +449,9 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	/* preprocess scankeys, set up the representation in *so */
 	spgPrepareScanKeys(scan);
 
+	/* release any pinned buffers from earlier rescans */
+	spgScanEndDropAllPagePins(scan, so);
+
 	/* set up starting queue entries */
 	resetSpGistScanOpaque(so);
 
@@ -428,6 +464,12 @@ spgendscan(IndexScanDesc scan)
 {
 	SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
 
+	/*
+	 * release any pinned buffers from earlier rescans, before we drop their
+	 * data by dropping the memory contexts.
+	 */
+	spgScanEndDropAllPagePins(scan, so);
+
 	MemoryContextDelete(so->tempCxt);
 	MemoryContextDelete(so->traversalCxt);
 
@@ -460,7 +502,7 @@ spgendscan(IndexScanDesc scan)
 static SpGistSearchItem *
 spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
 			   Datum leafValue, bool recheck, bool recheckDistances,
-			   bool isnull, double *distances)
+			   bool isnull, double *distances, Buffer addPin)
 {
 	SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances);
 
@@ -479,6 +521,10 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
 			datumCopy(leafValue, so->state.attType.attbyval,
 					  so->state.attType.attlen);
 
+		Assert(BufferIsValid(addPin));
+		IncrBufferRefCount(addPin);
+		item->buffer = addPin;
+
 		/*
 		 * If we're going to need to reconstruct INCLUDE attributes, store the
 		 * whole leaf tuple so we can get the INCLUDE attributes out of it.
@@ -495,6 +541,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
 	{
 		item->value = (Datum) 0;
 		item->leafTuple = NULL;
+		item->buffer = InvalidBuffer;
 	}
 	item->traversalValue = NULL;
 	item->isLeaf = true;
@@ -513,7 +560,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
 static bool
 spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
 			SpGistLeafTuple leafTuple, bool isnull,
-			bool *reportedSome, storeRes_func storeRes)
+			bool *reportedSome, storeRes_func storeRes, Buffer buffer)
 {
 	Datum		leafValue;
 	double	   *distances;
@@ -580,7 +627,8 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
 														recheck,
 														recheckDistances,
 														isnull,
-														distances);
+														distances,
+														buffer);
 
 			spgAddSearchItemToQueue(so, heapItem);
 
@@ -591,7 +639,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
 			/* non-ordered scan, so report the item right away */
 			Assert(!recheckDistances);
 			storeRes(so, &leafTuple->heapPtr, leafValue, isnull,
-					 leafTuple, recheck, false, NULL);
+					 leafTuple, recheck, false, NULL, InvalidBuffer);
 			*reportedSome = true;
 		}
 	}
@@ -760,7 +808,7 @@ enum SpGistSpecialOffsetNumbers
 static OffsetNumber
 spgTestLeafTuple(SpGistScanOpaque so,
 				 SpGistSearchItem *item,
-				 Page page, OffsetNumber offset,
+				 Page page, OffsetNumber offset, Buffer buffer,
 				 bool isnull, bool isroot,
 				 bool *reportedSome,
 				 storeRes_func storeRes)
@@ -799,7 +847,8 @@ spgTestLeafTuple(SpGistScanOpaque so,
 
 	Assert(ItemPointerIsValid(&leafTuple->heapPtr));
 
-	spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes);
+	spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes,
+				buffer);
 
 	return SGLT_GET_NEXTOFFSET(leafTuple);
 }
@@ -835,7 +884,8 @@ redirect:
 			Assert(so->numberOfNonNullOrderBys > 0);
 			storeRes(so, &item->heapPtr, item->value, item->isNull,
 					 item->leafTuple, item->recheck,
-					 item->recheckDistances, item->distances);
+					 item->recheckDistances, item->distances,
+					 item->buffer);
 			reportedSome = true;
 		}
 		else
@@ -873,7 +923,7 @@ redirect:
 					/* When root is a leaf, examine all its tuples */
 					for (offset = FirstOffsetNumber; offset <= max; offset++)
 						(void) spgTestLeafTuple(so, item, page, offset,
-												isnull, true,
+												buffer, isnull, true,
 												&reportedSome, storeRes);
 				}
 				else
@@ -883,10 +933,24 @@ redirect:
 					{
 						Assert(offset >= FirstOffsetNumber && offset <= max);
 						offset = spgTestLeafTuple(so, item, page, offset,
-												  isnull, false,
+												  buffer, isnull, false,
 												  &reportedSome, storeRes);
 						if (offset == SpGistRedirectOffsetNumber)
+						{
+							Assert(so->nPtrs == 0);
 							goto redirect;
+						}
+					}
+
+					/*
+					 * IOS: take one extra pin on this leaf page so that a
+					 * concurrent vacuum cannot remove the dead TIDs we just
+					 * copied and mark their heap pages all-visible before
+					 * the executor has consumed our copies.
+					 */
+					if (so->want_itup && reportedSome)
+					{
+						IncrBufferRefCount(buffer);
+						so->pagePin = buffer;
 					}
 				}
 			}
@@ -929,9 +993,10 @@ static void
 storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr,
 			Datum leafValue, bool isnull,
 			SpGistLeafTuple leafTuple, bool recheck,
-			bool recheckDistances, double *distances)
+			bool recheckDistances, double *distances,
+			Buffer pin)
 {
-	Assert(!recheckDistances && !distances);
+	Assert(!recheckDistances && !distances && !BufferIsValid(pin));
 	tbm_add_tuples(so->tbm, heapPtr, 1, recheck);
 	so->ntids++;
 }
@@ -954,10 +1019,9 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 
 /* storeRes subroutine for gettuple case */
 static void
-storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
-			  Datum leafValue, bool isnull,
-			  SpGistLeafTuple leafTuple, bool recheck,
-			  bool recheckDistances, double *nonNullDistances)
+storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue,
+			  bool isnull, SpGistLeafTuple leafTuple, bool recheck,
+			  bool recheckDistances, double *nonNullDistances, Buffer pin)
 {
 	Assert(so->nPtrs < MaxIndexTuplesPerPage);
 	so->heapPtrs[so->nPtrs] = *heapPtr;
@@ -1016,6 +1080,10 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
 		so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc,
 												   leafDatums,
 												   leafIsnulls);
+
+		/* take ownership of the caller-transferred buffer pin, if any */
+		if (BufferIsValid(pin))
+			so->pagePin = pin;
 	}
 	so->nPtrs++;
 }
@@ -1065,7 +1133,19 @@ spggettuple(IndexScanDesc scan, ScanDirection dir)
 
 			for (i = 0; i < so->nPtrs; i++)
 				pfree(so->reconTups[i]);
+
+			if (so->nPtrs > 0)
+			{
+				Assert(BufferIsValid(so->pagePin));
+				ReleaseBuffer(so->pagePin);
+				so->pagePin = InvalidBuffer;
+			}
 		}
+		else
+		{
+			Assert(!BufferIsValid(so->pagePin));
+		}
+
 		so->iPtr = so->nPtrs = 0;
 
 		spgWalk(scan->indexRelation, so, false, storeGettuple);
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index 894aefa19e1..f02c270c5cc 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -629,7 +629,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
 
 	buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
 								RBM_NORMAL, bds->info->strategy);
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	LockBufferForCleanup(buffer);
 	page = (Page) BufferGetPage(buffer);
 
 	if (PageIsNew(page))
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
index cb43a278f46..1948e53e2ff 100644
--- a/src/include/access/spgist_private.h
+++ b/src/include/access/spgist_private.h
@@ -175,6 +175,8 @@ typedef struct SpGistSearchItem
 	bool		isLeaf;			/* SearchItem is heap item */
 	bool		recheck;		/* qual recheck is needed */
 	bool		recheckDistances;	/* distance recheck is needed */
+	Buffer		buffer;			/* buffer pinned for this leaf tuple
+								 * (IOS-only) */
 
 	/* array with numberOfOrderBys entries */
 	double		distances[FLEXIBLE_ARRAY_MEMBER];
@@ -226,6 +228,7 @@ typedef struct SpGistScanOpaqueData
 	TupleDesc	reconTupDesc;	/* if so, descriptor for reconstructed tuples */
 	int			nPtrs;			/* number of TIDs found on current page */
 	int			iPtr;			/* index for scanning through same */
+	Buffer		pagePin;		/* output tuple's pinned buffer, if IOS */
 	ItemPointerData heapPtrs[MaxIndexTuplesPerPage];	/* TIDs from cur page */
 	bool		recheck[MaxIndexTuplesPerPage]; /* their recheck flags */
 	bool		recheckDistances[MaxIndexTuplesPerPage];	/* distance recheck
-- 
2.43.0

Reply via email to