From e8e242f86029afe86c5569c88346cabd8455df87 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 15 Jun 2025 18:06:57 -0400
Subject: [PATCH v1] Return TIDs in desc order during backwards scans.

Always return items to the scan in descending key space order during
nbtree backwards scans.  In particular, always return TIDs in descending
order when returning a group of duplicates to the scan, even when the
scan has to read from posting list tuples.  This makes nbtree backwards
scans tend to require fewer buffer hits, since the scan is less likely
to repeatedly pin and unpin the same heap page/buffer (repeat pinning
is now exactly as likely as in the equivalent forward scan case).

Commit 0d861bbb, which added nbtree deduplication, originally had
backwards scans return a posting list's TIDs in ascending heap TID
order, to avoid problems with how _bt_killitems sets LP_DEAD bits on
posting list tuples when scanning backwards.  We can work around that
by having _bt_killitems sort so->killedItems[] in page offset number
order (while tiebreaking on heap TID order for items from the same
posting list).
---
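
Illustration (not part of the patch): the new _bt_killitems code only
depends on so->killedItems[] ending up in ascending page offset number
order, with ties broken on heap TID.  The stand-alone sketch below shows
that sort-then-uniqueify idea using simplified stand-in types and plain
qsort(), rather than the real BTScanPosItem/qsort_arg()/qunique_arg()
machinery:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    /* Simplified stand-in for one entry the scan decided to kill */
    typedef struct KilledItem
    {
        uint16_t indexOffset;   /* index page offset number */
        uint32_t heapBlock;     /* heap TID: block number */
        uint16_t heapOffset;    /* heap TID: offset number */
    } KilledItem;

    /* Sort by page offset number, tiebreaking on heap TID */
    static int
    killed_item_cmp(const void *a, const void *b)
    {
        const KilledItem *ia = (const KilledItem *) a;
        const KilledItem *ib = (const KilledItem *) b;

        if (ia->indexOffset != ib->indexOffset)
            return (ia->indexOffset > ib->indexOffset) ? 1 : -1;
        if (ia->heapBlock != ib->heapBlock)
            return (ia->heapBlock > ib->heapBlock) ? 1 : -1;
        if (ia->heapOffset != ib->heapOffset)
            return (ia->heapOffset > ib->heapOffset) ? 1 : -1;
        return 0;
    }

    int
    main(void)
    {
        /* TIDs as a backwards scan might have returned them (descending) */
        KilledItem items[] = {
            {5, 10, 3}, {5, 10, 2}, {5, 10, 1},  /* posting list at offset 5 */
            {3, 7, 4}, {3, 7, 4},                /* same TID killed twice */
            {2, 6, 1},
        };
        size_t nitems = sizeof(items) / sizeof(items[0]);
        size_t nunique = 0;

        qsort(items, nitems, sizeof(KilledItem), killed_item_cmp);

        /* Uniqueify: keep only the first copy of each (offset, TID) pair */
        for (size_t i = 0; i < nitems; i++)
        {
            if (i == 0 || killed_item_cmp(&items[i], &items[nunique - 1]) != 0)
                items[nunique++] = items[i];
        }

        for (size_t i = 0; i < nunique; i++)
            printf("offset %u, TID (%u,%u)\n",
                   (unsigned) items[i].indexOffset,
                   (unsigned) items[i].heapBlock,
                   (unsigned) items[i].heapOffset);
        return 0;
    }

Running it prints the six input entries collapsed to five, in ascending
(offset, TID) order, which is the order the LP_DEAD-setting loop walks
the page in.
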
 src/backend/access/nbtree/nbtree.c    |  5 ++-
 src/backend/access/nbtree/nbtsearch.c | 30 ++++++---------
 src/backend/access/nbtree/nbtutils.c  | 53 ++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c1..77a48f1ad 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -257,7 +257,10 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 				 * caller reverses direction in the indexscan then the same
 				 * item might get entered multiple times. It's not worth
 				 * trying to optimize that, so we don't detect it, but instead
-				 * just forget any excess entries.
+				 * just forget any excess entries.  This happens infrequently.
+				 * We have enough so->killedItems[] space to store every TID
+				 * (whether dead or alive) several times over with most pages.
+				 * MaxTIDsPerBTreePage is a conservatively large array size.
 				 */
 				if (so->killedItems == NULL)
 					so->killedItems = (int *)
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 36544ecfd..db98cefaa 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1748,19 +1748,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID
-					 */
+					/* Set up posting list state (and remember first TID) */
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
 											  BTreeTupleGetPostingN(itup, 0),
 											  itup);
 					itemIndex++;
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all later TIDs (must be at least one) */
+					for (int i = 1; i < nitems; i++)
 					{
 						_bt_savepostingitem(so, itemIndex, offnum,
 											BTreeTupleGetPostingN(itup, i),
@@ -1935,25 +1934,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 				}
 				else
 				{
+					int			nitems = BTreeTupleGetNPosting(itup);
 					int			tupleOffset;
 
-					/*
-					 * Set up state to return posting list, and remember first
-					 * TID.
-					 *
-					 * Note that we deliberately save/return items from
-					 * posting lists in ascending heap TID order for backwards
-					 * scans.  This allows _bt_killitems() to make a
-					 * consistent assumption about the order of items
-					 * associated with the same posting list tuple.
-					 */
+					/* Set up posting list state (and remember last TID) */
 					itemIndex--;
 					tupleOffset =
 						_bt_setuppostingitems(so, itemIndex, offnum,
-											  BTreeTupleGetPostingN(itup, 0),
+											  BTreeTupleGetPostingN(itup, nitems - 1),
 											  itup);
-					/* Remember additional TIDs */
-					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+					/* Remember all prior TIDs (must be at least one) */
+					for (int i = nitems - 2; i >= 0; i--)
 					{
 						itemIndex--;
 						_bt_savepostingitem(so, itemIndex, offnum,
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index c71d1b6f2..88ed06a8c 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -20,6 +20,8 @@
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "commands/progress.h"
+#include "common/int.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
@@ -66,6 +68,7 @@ static bool _bt_check_rowcompare(ScanKey skey,
 								 ScanDirection dir, bool forcenonrequired, bool *continuescan);
 static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
 									 int tupnatts, TupleDesc tupdesc);
+static int	_bt_killitems_cmp(const void *a, const void *b, void *arg);
 static int	_bt_keep_natts(Relation rel, IndexTuple lastleft,
 						   IndexTuple firstright, BTScanInsert itup_key);
 
@@ -3322,6 +3325,27 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
 	}
 }
 
+/*
+ * _bt_killitems_cmp() -- qsort_arg comparison function used to sort and
+ * uniqueify so->killedItems[]
+ */
+static int
+_bt_killitems_cmp(const void *a, const void *b, void *arg)
+{
+	int			killedItemA = *((int *) a);
+	int			killedItemB = *((int *) b);
+	BTScanPosItem *items = (BTScanPosItem *) arg;
+	BTScanPosItem *itemA = &items[killedItemA];
+	BTScanPosItem *itemB = &items[killedItemB];
+
+	/* Compare page offset numbers */
+	if (itemA->indexOffset != itemB->indexOffset)
+		return pg_cmp_u16(itemA->indexOffset, itemB->indexOffset);
+
+	/* Tiebreak on heap TID (needed with posting lists) */
+	return ItemPointerCompare(&itemA->heapTid, &itemB->heapTid);
+}
+
 /*
  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
  * told us were killed
@@ -3364,13 +3388,40 @@ _bt_killitems(IndexScanDesc scan)
 	bool		killedsomething = false;
 	Buffer		buf;
 
-	Assert(numKilled > 0);
+	Assert(numKilled > 0 && numKilled <= MaxTIDsPerBTreePage);
 	Assert(BTScanPosIsValid(so->currPos));
 	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
 	/* Always invalidate so->killedItems[] before leaving so->currPos */
 	so->numKilled = 0;
 
+	/*
+	 * so->killedItems[] will be in whatever order the scan returned items in,
+	 * which means that the items will appear backwards during backwards scans
+	 * (though so->currPos.items[] must already be in ascending page offset
+	 * number order, no matter what direction _bt_readpage used earlier on).
+	 *
+	 * Sort and uniqueify so->killedItems[] using page offset number order,
+	 * while tie-breaking on heap TID (for entries in the same posting list).
+	 * Without this, the loop below would get confused about posting lists, at
+	 * least when _bt_readpage read so->currPos in backwards scan direction.
+	 *
+	 * We always sort and uniqueify here in case the scan direction changed
+	 * between the call to _bt_readpage and now.  so->killedItems[] might
+	 * otherwise contain multiple instances of the same entry/TID, which would
+	 * also confuse our processing of posting list tuples in the loop below.
+	 * This makes no difference during standard forwards scans, but it seems
+	 * best to err on the side of consistently setting LP_DEAD bits.
+	 */
+	if (numKilled > 1)
+	{
+		qsort_arg(so->killedItems, numKilled, sizeof(int),
+				  _bt_killitems_cmp, &so->currPos.items);
+		numKilled =
+			qunique_arg(so->killedItems, numKilled, sizeof(int),
+						_bt_killitems_cmp, &so->currPos.items);
+	}
+
 	if (!so->dropPin)
 	{
 		/*
-- 
2.49.0

