From bd3b59355e013c26a66548d8c4ce4abde5c49b21 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 15 Jun 2025 18:06:57 -0400
Subject: [PATCH v2] Return TIDs in desc order during backwards scans.

Always return items to the scan in descending key space order during
nbtree backwards scans.  In particular, always return TIDs in descending
order when returning a group of duplicates to the scan, even when the
scan has to read from posting list tuples.  This tends to reduce the
number of buffer hits required by nbtree backwards scans, since the
scan is less likely to repeatedly pin and unpin the same heap
page/buffer (it's now exactly as likely as in the equivalent forward
scan case).
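
To illustrate the buffer hit claim, consider a backwards scan that
reads two adjacent posting list tuples holding heap TIDs (41,3) (42,1)
and (42,2) (43,1).  Returning each posting list's TIDs in ascending
order used to make the scan visit heap blocks 42, 43, 41, 42 (pinning
block 42 twice), while strictly descending order visits blocks 43, 42,
42, 41.  A standalone sketch that counts the pin/unpin cycles
(hypothetical block numbers; illustration only, not PostgreSQL code):

    #include <stdio.h>

    /* Count how often a scan must swap heap buffers for a TID order */
    static int
    count_buffer_swaps(const int *heapblks, int ntids)
    {
        int     nswaps = 0;
        int     pinned = -1;    /* no heap buffer pinned yet */

        for (int i = 0; i < ntids; i++)
        {
            if (heapblks[i] != pinned)
            {
                pinned = heapblks[i];   /* unpin old buffer, pin new */
                nswaps++;
            }
        }
        return nswaps;
    }

    int
    main(void)
    {
        int     old_order[] = {42, 43, 41, 42};  /* ASC per posting list */
        int     new_order[] = {43, 42, 42, 41};  /* descending TID order */

        printf("old: %d buffer swaps\n", count_buffer_swaps(old_order, 4));
        printf("new: %d buffer swaps\n", count_buffer_swaps(new_order, 4));
        return 0;
    }

This prints 4 swaps for the old order and 3 for the new one; the saving
grows with the amount of heap block interleaving between adjacent
posting lists.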

Commit 0d861bbb, which added nbtree deduplication, originally had
backwards scans return a posting list's TIDs in ascending order, to
avoid problems with how _bt_killitems sets LP_DEAD bits on posting
list tuples when scanning backwards.  We can work around that by
replacing the so->killedItems[] array with a new itemDead field in the
BTScanPosItem struct (the struct used for so->currPos.items[] entries).
Now the kill_prior_tuple mechanism just sets this new itemDead field.
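
Stealing a bit from tupleOffset is presumably safe because tupleOffset
stores offsets into the scan's BLCKSZ-sized currTuples/markTuples
workspaces, and BLCKSZ can never exceed 32768 (so 15 bits suffice).  A
compile-time sketch of that assumption (illustration only; not part of
the patch):

    #include "postgres.h"
    #include "storage/itemptr.h"
    #include "storage/off.h"

    typedef struct BTScanPosItem    /* mirrors the patched nbtree.h */
    {
        ItemPointerData heapTid;    /* TID of referenced heap item */
        OffsetNumber indexOffset;   /* index item's location within page */
        uint16      tupleOffset:15, /* IndexTuple's offset in workspace */
                    itemDead:1;     /* mark IndexTuple LP_DEAD later on? */
    } BTScanPosItem;

    /* workspace offsets are < BLCKSZ, so they must fit in 15 bits */
    StaticAssertDecl(BLCKSZ <= (1 << 15), "tupleOffset needs > 15 bits");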

Getting rid of so->killedItems[] has an independent advantage: it
automatically prevents btgettuple/kill_prior_tuple from saving duplicate
TIDs.  These were known to confuse _bt_killitems in scenarios involving
posting list tuples.  That old issue probably didn't come up very often,
but it seems best to completely avoid it.
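
For example (sketch only), a scan that changes direction can report the
same item dead twice.  The old array-based scheme saved a duplicate
entry, whereas setting the new per-item bit twice is harmless:

    /* old scheme: a direction change could record the same item twice */
    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
    so->killedItems[so->numKilled++] = so->currPos.itemIndex;  /* dup */

    /* new scheme: re-marking the same item is naturally idempotent */
    so->currPos.items[so->currPos.itemIndex].itemDead = 1;
    so->currPos.items[so->currPos.itemIndex].itemDead = 1;     /* no-op */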

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=Wut2pKvbW-u3hJ_LXwsYeiXHiW8oN1GfbKPavcGo8Ow@mail.gmail.com
---
 src/include/access/nbtree.h           |  8 ++--
 src/backend/access/nbtree/nbtree.c    | 27 ++++-------
 src/backend/access/nbtree/nbtsearch.c | 37 +++++++--------
 src/backend/access/nbtree/nbtutils.c  | 65 +++++++++++++++++++++------
 4 files changed, 79 insertions(+), 58 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0a..0517464c7 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -956,7 +956,8 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 {
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
-	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint16		tupleOffset:15, /* IndexTuple's offset in workspace, if any */
+				itemDead:1;		/* Mark IndexTuple LP_DEAD later on? */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1067,9 +1068,8 @@ typedef struct BTScanOpaqueData
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
 
-	/* info about killed items if any (killedItems is NULL if never used) */
-	int		   *killedItems;	/* currPos.items indexes of killed items */
-	int			numKilled;		/* number of currently stored items */
+	/* guides _bt_killitems on LP_DEAD-marking known dead currPos items */
+	bool		itemDead;		/* dead items saved in so->currPos.items[]? */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
 	/*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..e177c7086 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -252,18 +252,10 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
-				 * tuples at once right before leaving the index page.)  The
-				 * test for numKilled overrun is not just paranoia: if the
-				 * caller reverses direction in the indexscan then the same
-				 * item might get entered multiple times. It's not worth
-				 * trying to optimize that, so we don't detect it, but instead
-				 * just forget any excess entries.
+				 * tuples at once right before leaving the index page.)
 				 */
-				if (so->killedItems == NULL)
-					so->killedItems = (int *)
-						palloc(MaxTIDsPerBTreePage * sizeof(int));
-				if (so->numKilled < MaxTIDsPerBTreePage)
-					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+				so->currPos.items[so->currPos.itemIndex].itemDead = 1;
+				so->itemDead = true;
 			}
 
 			/*
@@ -361,8 +353,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
 
-	so->killedItems = NULL;		/* until needed */
-	so->numKilled = 0;
+	so->itemDead = false;
 
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
@@ -391,7 +382,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (so->itemDead)
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
@@ -405,7 +396,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 * race condition involving VACUUM setting pages all-visible in the VM.
 	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
 	 *
-	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * When we drop pins eagerly, the btgettuple mechanism that marks dead
 	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
 	 * The scheme used to detect unsafe TID recycling won't work when scanning
 	 * unlogged relations (since it involves saving an affected page's LSN).
@@ -475,7 +466,7 @@ btendscan(IndexScanDesc scan)
 	if (BTScanPosIsValid(so->currPos))
 	{
 		/* Before leaving current page, deal with any killed items */
-		if (so->numKilled > 0)
+		if (so->itemDead)
 			_bt_killitems(scan);
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
@@ -491,8 +482,6 @@ btendscan(IndexScanDesc scan)
 	/* so->arrayKeys and so->orderProcs are in arrayContext */
 	if (so->arrayContext != NULL)
 		MemoryContextDelete(so->arrayContext);
-	if (so->killedItems != NULL)
-		pfree(so->killedItems);
 	if (so->currTuples != NULL)
 		pfree(so->currTuples);
 	/* so->markTuples should not be pfree'd, see btrescan */
@@ -555,7 +544,7 @@ btrestrpos(IndexScanDesc scan)
 		if (BTScanPosIsValid(so->currPos))
 		{
 			/* Before leaving current page, deal with any killed items */
-			if (so->numKilled > 0)
+			if (so->itemDead)
 				_bt_killitems(scan);
 			BTScanPosUnpinIfPinned(so->currPos);
 		}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 36544ecfd..d209c381a 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1748,19 +1748,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID
-					 */
+					/* Set up posting list state (and remember first TID) */
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
 											  BTreeTupleGetPostingN(itup, 0),
 											  itup);
 					itemIndex++;
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all later TIDs (must be at least one) */
+					for (int i = 1; i < nitems; i++)
 					{
 						_bt_savepostingitem(so, itemIndex, offnum,
 											BTreeTupleGetPostingN(itup, i),
@@ -1935,25 +1934,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID.
-					 *
-					 * Note that we deliberately save/return items from
-					 * posting lists in ascending heap TID order for backwards
-					 * scans.  This allows _bt_killitems() to make a
-					 * consistent assumption about the order of items
-					 * associated with the same posting list tuple.
-					 */
+					/* Set up posting list state (and remember last TID) */
 					itemIndex--;
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
-											  BTreeTupleGetPostingN(itup, 0),
+											  BTreeTupleGetPostingN(itup, nitems - 1),
 											  itup);
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all prior TIDs (must be at least one) */
+					for (int i = nitems - 2; i >= 0; i--)
 					{
 						itemIndex--;
 						_bt_savepostingitem(so, itemIndex, offnum,
@@ -2010,6 +2002,7 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = 0;		/* for now */
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -2040,6 +2033,7 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = 0;		/* for now */
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -2076,6 +2070,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->itemDead = 0;		/* for now */
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
@@ -2124,7 +2119,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 	Assert(BTScanPosIsValid(so->currPos));
 
 	/* Before leaving current page, deal with any killed items */
-	if (so->numKilled > 0)
+	if (so->itemDead)
 		_bt_killitems(scan);
 
 	/*
@@ -2222,7 +2217,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 
-	so->numKilled = 0;			/* just paranoia */
+	so->itemDead = false;		/* just paranoia */
 	so->markItemIndex = -1;		/* ditto */
 
 	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index c71d1b6f2..aafbfad6c 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3326,9 +3326,9 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
  * told us were killed
  *
- * scan->opaque, referenced locally through so, contains information about the
- * current page and killed tuples thereon (generally, this should only be
- * called if so->numKilled > 0).
+ * so->currPos contains information about tuples that are now known to be dead
+ * following a series of btgettuple calls made after _bt_readpage returned
+ * true, and before _bt_next asked to step off so->currPos's page.
  *
  * Caller should not have a lock on the so->currPos page, but must hold a
  * buffer pin when !so->dropPin.  When we return, it still won't be locked.
@@ -3360,16 +3360,27 @@ _bt_killitems(IndexScanDesc scan)
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
-	int			numKilled = so->numKilled;
+	OffsetNumber postingidxoffnum = InvalidOffsetNumber;
 	bool		killedsomething = false;
 	Buffer		buf;
 
-	Assert(numKilled > 0);
+	Assert(so->itemDead);
 	Assert(BTScanPosIsValid(so->currPos));
 	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-	/* Always invalidate so->killedItems[] before leaving so->currPos */
-	so->numKilled = 0;
+	/*
+	 * Unset so->itemDead to avoid repeatedly processing the same
+	 * so->currPos.items[] entries in scenarios involving mark and restore.
+	 *
+	 * We don't bother to unset individual itemDead bits.  When so->currPos is
+	 * saved in so->markPos, we shouldn't end up back here again (restoring
+	 * from so->markPos always unsets so->itemDead as needed by calling here).
+	 * If so->itemDead is set once more, it's still safe to reprocess stale
+	 * itemDead bits that were set before the mark was taken.  It's also
+	 * reasonably efficient, since we avoid trying to set an LP_DEAD bit
+	 * that's already been set.
+	 */
+	so->itemDead = false;
 
 	if (!so->dropPin)
 	{
@@ -3403,21 +3414,41 @@ _bt_killitems(IndexScanDesc scan)
 		/* Unmodified, hinting is safe */
 	}
 
+	/*
+	 * Iterate through so->currPos.items[], and LP_DEAD-mark index tuples
+	 * whose items[] entry has its itemDead field set.
+	 *
+	 * Note: we rely on so->currPos.items[] always being in leaf page order,
+	 * regardless of the _bt_readpage scan direction: items must appear in
+	 * ascending page offset number order.  Posting list tuple entries from
+	 * the same tuple/page offset must appear in ascending heap TID order.
+	 */
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (int i = 0; i < numKilled; i++)
+	for (int i = so->currPos.firstItem; i <= so->currPos.lastItem; i++)
 	{
-		int			itemIndex = so->killedItems[i];
-		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+		BTScanPosItem *kitem = &so->currPos.items[i];
 		OffsetNumber offnum = kitem->indexOffset;
 
-		Assert(itemIndex >= so->currPos.firstItem &&
-			   itemIndex <= so->currPos.lastItem);
 		if (offnum < minoff)
 			continue;			/* pure paranoia */
+
+		/* Skip over items that were never marked itemDead */
+		if (!kitem->itemDead)
+			continue;
+
+		if (offnum == postingidxoffnum)
+		{
+			/*
+			 * This item is a TID from a posting list tuple that has already
+			 * been completely processed
+			 */
+			continue;
+		}
+
 		while (offnum <= maxoff)
 		{
 			ItemId		iid = PageGetItemId(page, offnum);
@@ -3430,6 +3461,8 @@ _bt_killitems(IndexScanDesc scan)
 				int			nposting = BTreeTupleGetNPosting(ituple);
 				int			j;
 
+				postingidxoffnum = offnum;	/* outer loop will skip this offnum */
+
 				/*
 				 * We rely on the convention that heap TIDs in the scanpos
 				 * items array are stored in ascending heap TID order for a
@@ -3448,6 +3481,8 @@ _bt_killitems(IndexScanDesc scan)
 				{
 					ItemPointer item = BTreeTupleGetPostingN(ituple, j);
 
+					Assert(kitem->itemDead);
+
 					if (!ItemPointerEquals(item, &kitem->heapTid))
 						break;	/* out of posting list loop */
 
@@ -3472,8 +3507,10 @@ _bt_killitems(IndexScanDesc scan)
 					 * kitem is also the last heap TID in the last index tuple
 					 * correctly -- posting tuple still gets killed).
 					 */
-					if (pi < numKilled)
-						kitem = &so->currPos.items[so->killedItems[pi++]];
+					if (pi <= so->currPos.lastItem &&
+						so->currPos.items[pi].indexOffset == offnum &&
+						so->currPos.items[pi].itemDead)
+						kitem = &so->currPos.items[pi++];
 				}
 
 				/*
-- 
2.49.0

