From 1b828c768e30583c1a6ae4ad9a9352ada79c3917 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 15 Jun 2025 18:06:57 -0400
Subject: [PATCH v4] Return TIDs in desc order during backwards scans.

Always return TIDs in descending order during nbtree backwards scans,
including when a group of duplicates returned to the scan gets its TIDs
from a posting list tuple.  This makes backwards scans tend to require
fewer buffer hits, since the scan is less likely to repeatedly pin and
unpin the same heap page/buffer (we'll get exactly as many buffer hits
as a similar forwards scan would).
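For example, during a backwards scan, _bt_readpage now saves a posting
list tuple's TIDs in reverse.  Roughly (condensed from the _bt_readpage
changes below, with the surrounding page-scan loop omitted):

    int		nitems = BTreeTupleGetNPosting(itup);

    /* Save the posting list's last (largest) heap TID first */
    itemIndex--;
    tupleOffset =
        _bt_setuppostingitems(so, itemIndex, offnum,
                              BTreeTupleGetPostingN(itup, nitems - 1),
                              itup);

    /* Then save every earlier TID, in descending order */
    for (int i = nitems - 2; i >= 0; i--)
    {
        itemIndex--;
        _bt_savepostingitem(so, itemIndex, offnum,
                            BTreeTupleGetPostingN(itup, i),
                            tupleOffset);
    }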

Commit 0d861bbb, which added nbtree deduplication, originally had
backwards scans return a posting list tuple's TIDs in ascending order,
to avoid interfering with _bt_killitems's previous approach to setting
LP_DEAD bits on posting list tuples.  _bt_killitems made a soft
assumption that it could iterate through posting lists in TID order,
finding corresponding killedItems[]/so->currPos.items[] entries in that
same order (even when scanning backwards).  That worked out because of
the prior _bt_readpage backwards scan behavior.  If we just changed the
backwards scan posting list logic in _bt_readpage, without also
altering _bt_killitems itself, we'd break its soft assumption.
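For reference, the old _bt_killitems loop visited killedItems[] entries
strictly in the order that btgettuple saved them (this is the loop that
the patch removes from nbtutils.c below):

    for (int i = 0; i < numKilled; i++)
    {
        int			itemIndex = so->killedItems[i];
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
        OffsetNumber offnum = kitem->indexOffset;

        ...
    }

Once backwards scans save posting list TIDs in descending order, this
loop can no longer count on seeing a posting list's TIDs in ascending
heap TID order.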

Avoid that problem by converting the so->killedItems[] array into a
Bitmapset (which stores offsets into the so->currPos.items[] array) and
having _bt_killitems iterate through the Bitmapset.  That way, the
order in which items were originally saved in killedItems can't matter.
There is an additional benefit to using a Bitmapset for this: it is no
longer possible to run out of space in killedItems[] in cases where a
scrollable cursor repeatedly saves the same dead TID (from the same
page/so->currPos) by scrolling back and forth over it.  Calling
bms_add_member with the same item a second or third time cannot make
_bt_killitems any less effective, since bms_add_member is idempotent.
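A minimal sketch of the new scheme (condensed from the btgettuple and
_bt_killitems changes below):

    /* btgettuple: saving the same item a second time is a no-op */
    so->killedItems = bms_add_member(so->killedItems,
                                     so->currPos.itemIndex);

    /*
     * _bt_killitems: bms_next_member visits each saved index exactly
     * once, in ascending so->currPos.items[] order, no matter what
     * order the items were saved in
     */
    int			itemIndex = -1;

    while ((itemIndex = bms_next_member(so->killedItems,
                                        itemIndex)) >= 0)
    {
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];

        ...
    }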

Credit for the idea of using a Bitmapset for this goes to Tomas Vondra.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Mircea Cadariu <cadariu.mircea@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=Wut2pKvbW-u3hJ_LXwsYeiXHiW8oN1GfbKPavcGo8Ow@mail.gmail.com
---
 src/include/access/nbtree.h           |  5 ++-
 src/backend/access/nbtree/nbtree.c    | 27 +++++----------
 src/backend/access/nbtree/nbtsearch.c | 35 ++++++++------------
 src/backend/access/nbtree/nbtutils.c  | 47 +++++++++++++--------------
 4 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 16be5c7a9..000a12dea 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1067,9 +1067,8 @@ typedef struct BTScanOpaqueData
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
 
-	/* info about killed items if any (killedItems is NULL if never used) */
-	int		   *killedItems;	/* currPos.items indexes of killed items */
-	int			numKilled;		/* number of currently stored items */
+	/* info about killed items, if any */
+	Bitmapset  *killedItems;	/* currPos.items indexes of killed items */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
 	/*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..251a8aba6 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -252,18 +252,10 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
-				 * tuples at once right before leaving the index page.)  The
-				 * test for numKilled overrun is not just paranoia: if the
-				 * caller reverses direction in the indexscan then the same
-				 * item might get entered multiple times. It's not worth
-				 * trying to optimize that, so we don't detect it, but instead
-				 * just forget any excess entries.
+				 * tuples at once right before leaving the index page.)
 				 */
-				if (so->killedItems == NULL)
-					so->killedItems = (int *)
-						palloc(MaxTIDsPerBTreePage * sizeof(int));
-				if (so->numKilled < MaxTIDsPerBTreePage)
-					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+				so->killedItems = bms_add_member(so->killedItems,
+												 so->currPos.itemIndex);
 			}
 
 			/*
@@ -362,7 +354,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->arrayContext = NULL;
 
 	so->killedItems = NULL;		/* until needed */
-	so->numKilled = 0;
 
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
@@ -391,7 +382,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (!bms_is_empty(so->killedItems))
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
@@ -405,7 +396,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 * race condition involving VACUUM setting pages all-visible in the VM.
 	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
 	 *
-	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * When we drop pins eagerly, the btgettuple mechanism that marks dead
 	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
 	 * The scheme used to detect unsafe TID recycling won't work when scanning
 	 * unlogged relations (since it involves saving an affected page's LSN).
@@ -475,7 +466,7 @@ btendscan(IndexScanDesc scan)
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (!bms_is_empty(so->killedItems))
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
@@ -483,8 +474,6 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
-	/* No need to invalidate positions, the RAM is about to be freed. */
-
 	/* Release storage */
 	if (so->keyData != NULL)
 		pfree(so->keyData);
@@ -492,7 +481,7 @@ btendscan(IndexScanDesc scan)
 	if (so->arrayContext != NULL)
 		MemoryContextDelete(so->arrayContext);
 	if (so->killedItems != NULL)
-		pfree(so->killedItems);
+		bms_free(so->killedItems);
 	if (so->currTuples != NULL)
 		pfree(so->currTuples);
 	/* so->markTuples should not be pfree'd, see btrescan */
@@ -555,7 +544,7 @@ btrestrpos(IndexScanDesc scan)
 		if (BTScanPosIsValid(so->currPos))
 		{
 			/* Before leaving current page, deal with any killed items */
-			if (so->numKilled > 0)
+			if (!bms_is_empty(so->killedItems))
 				_bt_killitems(scan);
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 0605356ec..e50bee44b 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1797,19 +1797,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID
-					 */
+					/* Set up posting list state (and remember first TID) */
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
 											  BTreeTupleGetPostingN(itup, 0),
 											  itup);
 					itemIndex++;
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all later TIDs (must be at least one) */
+					for (int i = 1; i < nitems; i++)
 					{
 						_bt_savepostingitem(so, itemIndex, offnum,
 											BTreeTupleGetPostingN(itup, i),
@@ -1984,25 +1983,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID.
-					 *
-					 * Note that we deliberately save/return items from
-					 * posting lists in ascending heap TID order for backwards
-					 * scans.  This allows _bt_killitems() to make a
-					 * consistent assumption about the order of items
-					 * associated with the same posting list tuple.
-					 */
+					/* Set up posting list state (and remember last TID) */
 					itemIndex--;
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
-											  BTreeTupleGetPostingN(itup, 0),
+											  BTreeTupleGetPostingN(itup, nitems - 1),
 											  itup);
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all prior TIDs (must be at least one) */
+					for (int i = nitems - 2; i >= 0; i--)
 					{
 						itemIndex--;
 						_bt_savepostingitem(so, itemIndex, offnum,
@@ -2173,7 +2165,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	Assert(BTScanPosIsValid(so->currPos));
 
 	/* Before leaving current page, deal with any killed items */
-	if (so->numKilled > 0)
+	if (!bms_is_empty(so->killedItems))
 		_bt_killitems(scan);
 
 	/*
@@ -2268,8 +2260,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	so->numKilled = 0;			/* just paranoia */
-	so->markItemIndex = -1;		/* ditto */
+	so->markItemIndex = -1;		/* just paranoia */
 
 	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
 	if (so->needPrimScan)
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ab0f98b02..dae47ecfb 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3372,9 +3372,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
  * told us were killed
  *
- * scan->opaque, referenced locally through so, contains information about the
- * current page and killed tuples thereon (generally, this should only be
- * called if so->numKilled > 0).
+ * so->currPos contains information about tuples that are now known to be dead
+ * following a series of btgettuple calls made after _bt_readpage returned
+ * true.  Called when the scan is about to step off so->currPos's page.
  *
  * Caller should not have a lock on the so->currPos page, but must hold a
  * buffer pin when !so->dropPin.  When we return, it still won't be locked.
@@ -3406,17 +3406,14 @@ _bt_killitems(IndexScanDesc scan)
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
-	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
 	Buffer		buf;
+	int			itemIndex = -1;
 
-	Assert(numKilled > 0);
+	Assert(!bms_is_empty(so->killedItems));
 	Assert(BTScanPosIsValid(so->currPos));
 	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-	/* Always invalidate so->killedItems[] before leaving so->currPos */
-	so->numKilled = 0;
-
 	if (!so->dropPin)
 	{
 		/*
@@ -3442,6 +3439,10 @@ _bt_killitems(IndexScanDesc scan)
 		{
 			/* Modified, give up on hinting */
 			_bt_relbuf(rel, buf);
+
+			/* Always invalidate before leaving so->currPos */
+			bms_free(so->killedItems);
+			so->killedItems = NULL;
 			return;
 		}
 
@@ -3453,9 +3454,8 @@ _bt_killitems(IndexScanDesc scan)
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (int i = 0; i < numKilled; i++)
+	while ((itemIndex = bms_next_member(so->killedItems, itemIndex)) >= 0)
 	{
-		int			itemIndex = so->killedItems[i];
 		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
 		OffsetNumber offnum = kitem->indexOffset;
 
@@ -3471,7 +3471,7 @@ _bt_killitems(IndexScanDesc scan)
 
 			if (BTreeTupleIsPosting(ituple))
 			{
-				int			pi = i + 1;
+				int			nextIndex = itemIndex;
 				int			nposting = BTreeTupleGetNPosting(ituple);
 				int			j;
 
@@ -3479,10 +3479,7 @@ _bt_killitems(IndexScanDesc scan)
 				 * We rely on the convention that heap TIDs in the scanpos
 				 * items array are stored in ascending heap TID order for a
 				 * group of TIDs that originally came from a posting list
-				 * tuple.  This convention even applies during backwards
-				 * scans, where returning the TIDs in descending order might
-				 * seem more natural.  This is about effectiveness, not
-				 * correctness.
+				 * tuple.
 				 *
 				 * Note that the page may have been modified in almost any way
 				 * since we first read it (in the !so->dropPin case), so it's
@@ -3517,18 +3514,16 @@ _bt_killitems(IndexScanDesc scan)
 					 * kitem is also the last heap TID in the last index tuple
 					 * correctly -- posting tuple still gets killed).
 					 */
-					if (pi < numKilled)
-						kitem = &so->currPos.items[so->killedItems[pi++]];
+					nextIndex = bms_next_member(so->killedItems, nextIndex);
+					if (nextIndex >= 0)
+						kitem = &so->currPos.items[nextIndex];
 				}
 
 				/*
-				 * Don't bother advancing the outermost loop's int iterator to
-				 * avoid processing killed items that relate to the same
-				 * offnum/posting list tuple.  This micro-optimization hardly
-				 * seems worth it.  (Further iterations of the outermost loop
-				 * will fail to match on this same posting list's first heap
-				 * TID instead, so we'll advance to the next offnum/index
-				 * tuple pretty quickly.)
+				 * Don't advance the outermost loop's itemIndex here, no
+				 * matter how far nextIndex was advanced.  Items whose TIDs
+				 * weren't matched within this posting list might still be
+				 * killable (a later index tuple's TID might match).
 				 */
 				if (j == nposting)
 					killtuple = true;
@@ -3572,6 +3567,10 @@ _bt_killitems(IndexScanDesc scan)
 		_bt_unlockbuf(rel, buf);
 	else
 		_bt_relbuf(rel, buf);
+
+	/* Always invalidate before leaving so->currPos */
+	bms_free(so->killedItems);
+	so->killedItems = NULL;
 }
 
 
-- 
2.51.0

