From 7d307b785dbd8106e9f11120164039d3723fa1ed Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 15 Jun 2025 18:06:57 -0400
Subject: [PATCH v3] Return TIDs in desc order during backwards scans.

Always return items to the scan in descending key space order during
nbtree backwards scans.  In particular, always return TIDs in descending
order when returning a group of duplicates to the scan whose TIDs come
from an nbtree posting list tuple.  This makes nbtree backwards scans
tend to require fewer buffer hits, since the scan is less likely to
repeatedly pin and unpin the same heap page/buffer (we'll get exactly as
many buffer hits as an equivalent forwards scan would).
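
For example (hypothetical TIDs): suppose one posting list stores heap
TIDs (1,1) and (2,1), while its right sibling duplicate stores (2,2)
and (3,1).  A backwards scan that returns each posting list's TIDs in
ascending order visits heap pages 2, 3, 1, 2, pinning page 2's buffer
twice; returning the TIDs in descending order visits pages 3, 2, 2, 1,
pinning page 2's buffer only once (just like the equivalent forwards
scan, which visits pages 1, 2, 2, 3).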

Commit 0d861bbb, which added nbtree deduplication, originally did things
this way to avoid problems with how _bt_killitems sets LP_DEAD bits on
posting list tuples when scanning backwards.  We can work around that by
replacing the so->killedItems[] array with a new itemDead bit in the
BTScanPosItem struct (the struct used for so->currPos.items[] entries).
The kill_prior_tuple mechanism now just sets this new itemDead field,
and the _bt_killitems loop always iterates through so->currPos.items[]
in leaf-page-wise order, regardless of the scan direction.
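
In other words, the new _bt_killitems loop boils down to the following
(simplified sketch; see the nbtutils.c changes below for the real loop):

    for (int itemIndex = so->currPos.firstItem;
         itemIndex <= so->currPos.lastItem;
         itemIndex++)
    {
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];

        if (kitem->itemDead == BT_TUPLE_UNKNOWN)
            continue;       /* btgettuple never marked this item */

        /* ... try to LP_DEAD-mark the tuple at kitem->indexOffset ... */
    }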

Getting rid of so->killedItems[] has an independent advantage: it
automatically prevents btgettuple/kill_prior_tuple from saving duplicate
TIDs (since setting the itemDead bit of a so->currPos.items[] entry is
idempotent).
That way _bt_killitems cannot possibly examine the same tuple TID twice.
This behavior probably wasn't harmful, but it still seems worth avoiding.
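
For instance, if the caller reverses the scan's direction and btgettuple
reports the same item as killed a second time, the old code appended a
second (duplicate) index to so->killedItems[], whereas the new code just
executes

    so->currPos.items[so->currPos.itemIndex].itemDead = BT_TUPLE_DEAD;

again, which changes nothing.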

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Mircea Cadariu <cadariu.mircea@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=Wut2pKvbW-u3hJ_LXwsYeiXHiW8oN1GfbKPavcGo8Ow@mail.gmail.com
---
 src/include/access/nbtree.h           |  10 ++-
 src/backend/access/nbtree/nbtree.c    |  28 ++-----
 src/backend/access/nbtree/nbtsearch.c |  37 ++++-----
 src/backend/access/nbtree/nbtutils.c  | 105 +++++++++++++++-----------
 4 files changed, 89 insertions(+), 91 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0a..440cfc8df 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -951,12 +951,15 @@ typedef BTVacuumPostingData *BTVacuumPosting;
  * allowing the same key to be returned for each TID in the posting list
  * tuple.
  */
+#define BT_TUPLE_UNKNOWN	0	/* Don't yet know whether to mark item */
+#define BT_TUPLE_DEAD		1	/* Try to LP_DEAD-mark item's tuple */
 
 typedef struct BTScanPosItem	/* what we remember about each match */
 {
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
-	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint16		tupleOffset:15, /* IndexTuple's offset in workspace, if any */
+				itemDead:1;		/* Mark IndexTuple LP_DEAD later on? */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1067,9 +1070,8 @@ typedef struct BTScanOpaqueData
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
 
-	/* info about killed items if any (killedItems is NULL if never used) */
-	int		   *killedItems;	/* currPos.items indexes of killed items */
-	int			numKilled;		/* number of currently stored items */
+	/* guides _bt_killitems on how to LP_DEAD-mark known-dead currPos items */
+	bool		itemsDead;		/* dead items saved in so->currPos.items[]? */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
 	/*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..5a3aff76a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -252,18 +252,10 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
-				 * tuples at once right before leaving the index page.)  The
-				 * test for numKilled overrun is not just paranoia: if the
-				 * caller reverses direction in the indexscan then the same
-				 * item might get entered multiple times. It's not worth
-				 * trying to optimize that, so we don't detect it, but instead
-				 * just forget any excess entries.
+				 * tuples at once right before leaving the index page.)
 				 */
-				if (so->killedItems == NULL)
-					so->killedItems = (int *)
-						palloc(MaxTIDsPerBTreePage * sizeof(int));
-				if (so->numKilled < MaxTIDsPerBTreePage)
-					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+				so->currPos.items[so->currPos.itemIndex].itemDead = BT_TUPLE_DEAD;
+				so->itemsDead = true;
 			}
 
 			/*
@@ -361,9 +353,6 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
 
-	so->killedItems = NULL;		/* until needed */
-	so->numKilled = 0;
-
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -391,7 +380,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (so->itemsDead)
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
@@ -405,7 +394,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 * race condition involving VACUUM setting pages all-visible in the VM.
 	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
 	 *
-	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * When we drop pins eagerly, the btgettuple mechanism that marks dead
 	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
 	 * The scheme used to detect unsafe TID recycling won't work when scanning
 	 * unlogged relations (since it involves saving an affected page's LSN).
@@ -420,6 +409,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 *
 	 * Note: so->dropPin should never change across rescans.
 	 */
+	so->itemsDead = false;
 	so->dropPin = (!scan->xs_want_itup &&
 				   IsMVCCSnapshot(scan->xs_snapshot) &&
 				   RelationNeedsWAL(scan->indexRelation) &&
@@ -475,7 +465,7 @@ btendscan(IndexScanDesc scan)
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (so->itemsDead)
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
@@ -491,8 +481,6 @@ btendscan(IndexScanDesc scan)
 	/* so->arrayKeys and so->orderProcs are in arrayContext */
 	if (so->arrayContext != NULL)
 		MemoryContextDelete(so->arrayContext);
-	if (so->killedItems != NULL)
-		pfree(so->killedItems);
 	if (so->currTuples != NULL)
 		pfree(so->currTuples);
 	/* so->markTuples should not be pfree'd, see btrescan */
@@ -555,7 +543,7 @@ btrestrpos(IndexScanDesc scan)
 		if (BTScanPosIsValid(so->currPos))
 		{
 			/* Before leaving current page, deal with any killed items */
-			if (so->numKilled > 0)
+			if (so->itemsDead)
 				_bt_killitems(scan);
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index d69798795..c61177859 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1795,19 +1795,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID
-					 */
+					/* Set up posting list state (and remember first TID) */
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
 											  BTreeTupleGetPostingN(itup, 0),
 											  itup);
 					itemIndex++;
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all later TIDs (must be at least one) */
+					for (int i = 1; i < nitems; i++)
 					{
 						_bt_savepostingitem(so, itemIndex, offnum,
 											BTreeTupleGetPostingN(itup, i),
@@ -1982,25 +1981,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID.
-					 *
-					 * Note that we deliberately save/return items from
-					 * posting lists in ascending heap TID order for backwards
-					 * scans.  This allows _bt_killitems() to make a
-					 * consistent assumption about the order of items
-					 * associated with the same posting list tuple.
-					 */
+					/* Set up posting list state (and remember last TID) */
 					itemIndex--;
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
-											  BTreeTupleGetPostingN(itup, 0),
+											  BTreeTupleGetPostingN(itup, nitems - 1),
 											  itup);
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all prior TIDs (must be at least one) */
+					for (int i = nitems - 2; i >= 0; i--)
 					{
 						itemIndex--;
 						_bt_savepostingitem(so, itemIndex, offnum,
@@ -2057,6 +2049,7 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = BT_TUPLE_UNKNOWN;	/* for now */
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -2087,6 +2080,7 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = BT_TUPLE_UNKNOWN;	/* for now */
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -2123,6 +2117,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = BT_TUPLE_UNKNOWN;	/* for now */
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
@@ -2171,7 +2166,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	Assert(BTScanPosIsValid(so->currPos));
 
 	/* Before leaving current page, deal with any killed items */
-	if (so->numKilled > 0)
+	if (so->itemsDead)
 		_bt_killitems(scan);
 
 	/*
@@ -2269,7 +2264,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	so->numKilled = 0;			/* just paranoia */
+	so->itemsDead = false;		/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
 
 	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 9aed20799..b5c9a6146 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3227,9 +3227,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
  * told us were killed
  *
- * scan->opaque, referenced locally through so, contains information about the
- * current page and killed tuples thereon (generally, this should only be
- * called if so->numKilled > 0).
+ * so->currPos contains information about tuples that are now known to be dead
+ * following a series of btgettuple calls made after _bt_readpage returned
+ * true.  Called when the scan is about to step off so->currPos's page.
  *
  * Caller should not have a lock on the so->currPos page, but must hold a
  * buffer pin when !so->dropPin.  When we return, it still won't be locked.
@@ -3261,16 +3261,18 @@ _bt_killitems(IndexScanDesc scan)
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
-	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
 	Buffer		buf;
 
-	Assert(numKilled > 0);
+	Assert(so->itemsDead);
 	Assert(BTScanPosIsValid(so->currPos));
 	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-	/* Always invalidate so->killedItems[] before leaving so->currPos */
-	so->numKilled = 0;
+	/*
+	 * Unset so->itemsDead to avoid reprocessing so->currPos.items[] in
+	 * scans that repeatedly restore the same mark.
+	 */
+	so->itemsDead = false;
 
 	if (!so->dropPin)
 	{
@@ -3304,21 +3306,39 @@ _bt_killitems(IndexScanDesc scan)
 		/* Unmodified, hinting is safe */
 	}
 
+	/*
+	 * Iterate through so->currPos.items[], and LP_DEAD-mark index tuples
+	 * whose items[] entry has its itemDead field set.
+	 */
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (int i = 0; i < numKilled; i++)
+	for (int itemIndex = so->currPos.firstItem;
+		 itemIndex <= so->currPos.lastItem;
+		 itemIndex++)
 	{
-		int			itemIndex = so->killedItems[i];
 		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
 		OffsetNumber offnum = kitem->indexOffset;
 
-		Assert(itemIndex >= so->currPos.firstItem &&
-			   itemIndex <= so->currPos.lastItem);
+		/* Quickly skip over items never itemDead-set by btgettuple */
+		if (kitem->itemDead == BT_TUPLE_UNKNOWN)
+			continue;
+
+		/* Defensively unset item's itemDead bit */
+		kitem->itemDead = BT_TUPLE_UNKNOWN;
+
 		if (offnum < minoff)
-			continue;			/* pure paranoia */
+		{
+			/*
+			 * Page must have originally been the rightmost page, but has
+			 * since split
+			 */
+			Assert(!so->dropPin);
+			offnum = minoff;
+		}
+
 		while (offnum <= maxoff)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
@@ -3327,29 +3347,15 @@ _bt_killitems(IndexScanDesc scan)
 
 			if (BTreeTupleIsPosting(ituple))
 			{
-				int			pi = i + 1;
+				int			pItemIndex = itemIndex + 1;
 				int			nposting = BTreeTupleGetNPosting(ituple);
 				int			j;
 
-				/*
-				 * We rely on the convention that heap TIDs in the scanpos
-				 * items array are stored in ascending heap TID order for a
-				 * group of TIDs that originally came from a posting list
-				 * tuple.  This convention even applies during backwards
-				 * scans, where returning the TIDs in descending order might
-				 * seem more natural.  This is about effectiveness, not
-				 * correctness.
-				 *
-				 * Note that the page may have been modified in almost any way
-				 * since we first read it (in the !so->dropPin case), so it's
-				 * possible that this posting list tuple wasn't a posting list
-				 * tuple when we first encountered its heap TIDs.
-				 */
 				for (j = 0; j < nposting; j++)
 				{
-					ItemPointer item = BTreeTupleGetPostingN(ituple, j);
+					ItemPointer ptid = BTreeTupleGetPostingN(ituple, j);
 
-					if (!ItemPointerEquals(item, &kitem->heapTid))
+					if (!ItemPointerEquals(ptid, &kitem->heapTid))
 						break;	/* out of posting list loop */
 
 					/*
@@ -3359,26 +3365,33 @@ _bt_killitems(IndexScanDesc scan)
 					 */
 					Assert(kitem->indexOffset == offnum || !so->dropPin);
 
-					/*
-					 * Read-ahead to later kitems here.
-					 *
-					 * We rely on the assumption that not advancing kitem here
-					 * will prevent us from considering the posting list tuple
-					 * fully dead by not matching its next heap TID in next
-					 * loop iteration.
-					 *
-					 * If, on the other hand, this is the final heap TID in
-					 * the posting list tuple, then tuple gets killed
-					 * regardless (i.e. we handle the case where the last
-					 * kitem is also the last heap TID in the last index tuple
-					 * correctly -- posting tuple still gets killed).
-					 */
-					if (pi < numKilled)
-						kitem = &so->currPos.items[so->killedItems[pi++]];
+					/* Read-ahead to later kitems here */
+					if (pItemIndex > so->currPos.lastItem ||
+						so->currPos.items[pItemIndex].itemDead == BT_TUPLE_UNKNOWN)
+					{
+						/*
+						 * We've run out of kitems, or the next kitem wasn't
+						 * itemDead-set by btgettuple.  This _usually_ means
+						 * that ituple cannot safely be LP_DEAD-marked.
+						 *
+						 * If the current ptid is ituple's max heap TID, then
+						 * "j == nposting" _will_ be satisfied; ituple _will_
+						 * therefore be LP_DEAD-marked after all.
+						 *
+						 * If the current ptid is _not_ ituple's max heap TID,
+						 * then a failed attempt at matching the _current_
+						 * kitem to the _next_ ptid from ituple will prevent
+						 * ituple from being incorrectly LP_DEAD-marked.
+						 */
+						continue;
+					}
+
+					/* Advance to next kitem itemDead-set by btgettuple */
+					kitem = &so->currPos.items[pItemIndex++];
 				}
 
 				/*
-				 * Don't bother advancing the outermost loop's int iterator to
+				 * Don't bother advancing the outermost loop's itemIndex to
 				 * avoid processing killed items that relate to the same
 				 * offnum/posting list tuple.  This micro-optimization hardly
 				 * seems worth it.  (Further iterations of the outermost loop
-- 
2.50.0

