On Fri, Oct 11, 2024 at 7:29 PM Peter Geoghegan <p...@bowt.ie> wrote:
> Attached is v5

Now I'm attaching a v6, which further polishes things. Current plan is to
commit something very close to this in the next day or two.

v6 is mostly just further comment polishing. But it also merges together
the two existing _bt_readnextpage loops (for forward and backward scan
directions) into one single loop that handles everything. This is
definitely a win for brevity, and might also be a win for clarity --
but I'm not 100% sure which way I prefer it just yet.

I'll need to make a final decision on this loop merging business
before committing. Anybody else have an opinion, either way?

--
Peter Geoghegan
From 17959762e34cf8ee75af37f98dc5b085d05c7860 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 8 Oct 2024 11:40:54 -0400
Subject: [PATCH v6] Optimize and simplify nbtree backwards scans.

This enhancement completely supersedes the one added by commit 3f44959f.

Author: Matthias van de Meent <boekewurm+postgres@gmail.com>
Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAEze2WgpBGRgTTxTWVPXc9+PB6fc1a7t+VyGXHzfnrFXcQVxnA@mail.gmail.com
Discussion: https://postgr.es/m/CAH2-WzkBTuFv7W2+84jJT8mWZLXVL0GHq2hMUTn6c9Vw=eYrCw@mail.gmail.com
---
 src/include/access/nbtree.h           |  23 +-
 src/backend/access/nbtree/nbtree.c    |  77 ++-
 src/backend/access/nbtree/nbtsearch.c | 744 ++++++++++++--------------
 src/backend/access/nbtree/nbtutils.c  |   2 +-
 4 files changed, 401 insertions(+), 445 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 90a68375a..22bf38934 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -954,6 +954,7 @@ typedef struct BTScanPosData
 
 	XLogRecPtr	lsn;			/* pos in the WAL stream when page was read */
 	BlockNumber currPage;		/* page referenced by items array */
+	BlockNumber prevPage;		/* page's left link when we scanned it */
 	BlockNumber nextPage;		/* page's right link when we scanned it */
 
 	/*
@@ -965,15 +966,6 @@ typedef struct BTScanPosData
 	bool		moreLeft;
 	bool		moreRight;
 
-	/*
-	 * Direction of the scan at the time that _bt_readpage was called.
-	 *
-	 * Used by btrestrpos to "restore" the scan's array keys by resetting each
-	 * array to its first element's value (first in this scan direction). This
-	 * avoids the need to directly track the array keys in btmarkpos.
-	 */
-	ScanDirection dir;
-
 	/*
 	 * If we are doing an index-only scan, nextTupleOffset is the first free
 	 * location in the associated tuple storage workspace.
@@ -1022,6 +1014,7 @@ typedef BTScanPosData *BTScanPos;
 #define BTScanPosInvalidate(scanpos) \
 	do { \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).prevPage = InvalidBlockNumber; \
 		(scanpos).nextPage = InvalidBlockNumber; \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).lsn = InvalidXLogRecPtr; \
@@ -1073,6 +1066,7 @@ typedef struct BTScanOpaqueData
 	 * markPos.
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
+	ScanDirection markDir;		/* direction of scan with mark */
 
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
@@ -1091,7 +1085,6 @@ typedef struct BTReadPageState
 	OffsetNumber minoff;		/* Lowest non-pivot tuple's offset */
 	OffsetNumber maxoff;		/* Highest non-pivot tuple's offset */
 	IndexTuple	finaltup;		/* Needed by scans with array keys */
-	BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
 	Page		page;			/* Page being read */
 
 	/* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
@@ -1192,12 +1185,14 @@ extern int	btgettreeheight(Relation rel);
 /*
  * prototypes for internal functions in nbtree.c
  */
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
-							   bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+							   BlockNumber *curr_page, bool first);
+extern void _bt_parallel_release(IndexScanDesc scan,
+								 BlockNumber next_scan_page,
+								 BlockNumber curr_page);
 extern void _bt_parallel_done(IndexScanDesc scan);
 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
-										   BlockNumber prev_scan_page);
+										   BlockNumber curr_page);
 
 /*
  * prototypes for functions in nbtdedup.c
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 9b968aa0f..ff4fa2cf9 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -66,7 +66,9 @@ typedef enum
  */
 typedef struct BTParallelScanDescData
 {
-	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
+	BlockNumber btps_nextScanPage;	/* next page to be scanned */
+	BlockNumber btps_lastCurrPage;	/* page whose sibling link was copied into
+									 * btps_scanPage */
 	BTPS_State	btps_pageStatus;	/* indicates whether next page is
 									 * available for scan. see above for
 									 * possible states of parallel scan. */
@@ -522,7 +524,7 @@ btrestrpos(IndexScanDesc scan)
 			/* Reset the scan's array keys (see _bt_steppage for why) */
 			if (so->numArrayKeys)
 			{
-				_bt_start_array_keys(scan, so->currPos.dir);
+				_bt_start_array_keys(scan, so->markDir);
 				so->needPrimScan = false;
 			}
 		}
@@ -550,7 +552,8 @@ btinitparallelscan(void *target)
 	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 
 	SpinLockInit(&bt_target->btps_mutex);
-	bt_target->btps_scanPage = InvalidBlockNumber;
+	bt_target->btps_nextScanPage = InvalidBlockNumber;
+	bt_target->btps_lastCurrPage = InvalidBlockNumber;
 	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 	ConditionVariableInit(&bt_target->btps_cv);
 }
@@ -575,7 +578,8 @@ btparallelrescan(IndexScanDesc scan)
 	 * consistency.
 	 */
 	SpinLockAcquire(&btscan->btps_mutex);
-	btscan->btps_scanPage = InvalidBlockNumber;
+	btscan->btps_nextScanPage = InvalidBlockNumber;
+	btscan->btps_lastCurrPage = InvalidBlockNumber;
 	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 	SpinLockRelease(&btscan->btps_mutex);
 }
@@ -591,18 +595,21 @@ btparallelrescan(IndexScanDesc scan)
  * start just yet (only backends that call from _bt_first are capable of
  * starting primitive index scans, which they indicate by passing first=true).
  *
- * If the return value is true, *pageno returns the next or current page
- * of the scan (depending on the scan direction).  An invalid block number
- * means the scan hasn't yet started, or that caller needs to start the next
- * primitive index scan (if it's the latter case we'll set so.needPrimScan).
- * The first time a participating process reaches the last page, it will return
- * true and set *pageno to P_NONE; after that, further attempts to seize the
- * scan will return false.
+ * If the return value is true, *next_scan_page returns the next page of the
+ * scan, and *curr_page returns the page whose sibling link *next_scan_page
+ * came from.  An invalid *next_scan_page means the scan hasn't yet started,
+ * or that caller needs to start the next primitive index scan (if it's the
+ * latter case we'll set so.needPrimScan).  The first time a participating
+ * process reaches the last page, it will return true and set *next_scan_page
+ * to P_NONE; after that, further attempts to seize the scan will return
+ * false.
  *
- * Callers should ignore the value of pageno if the return value is false.
+ * Callers should ignore the value of next_scan_page and curr_page if the
+ * return value is false.
  */
 bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
+				   BlockNumber *curr_page, bool first)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		exit_loop = false;
@@ -610,7 +617,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
 
-	*pageno = P_NONE;
+	*next_scan_page = P_NONE;
+	*curr_page = InvalidBlockNumber;
+
+	/*
+	 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
+	 * call to _bt_readnextpage treats this backend similarly to a serial
+	 * backend that steps from *curr_page to *next_scan_page (unless this
+	 * backend's so->currPos is initialized by _bt_readfirstpage before then).
+	 */
+	BTScanPosInvalidate(so->currPos);
+	so->currPos.moreLeft = so->currPos.moreRight = true;
 
 	if (first)
 	{
@@ -660,7 +677,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 					array->cur_elem = btscan->btps_arrElems[i];
 					skey->sk_argument = array->elem_values[array->cur_elem];
 				}
-				*pageno = InvalidBlockNumber;
+				*next_scan_page = InvalidBlockNumber;
 				exit_loop = true;
 			}
 			else
@@ -688,7 +705,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 * of advancing it to a new page!
 			 */
 			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
-			*pageno = btscan->btps_scanPage;
+			*next_scan_page = btscan->btps_nextScanPage;
+			*curr_page = btscan->btps_lastCurrPage;
 			exit_loop = true;
 		}
 		SpinLockRelease(&btscan->btps_mutex);
@@ -703,17 +721,20 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 
 /*
  * _bt_parallel_release() -- Complete the process of advancing the scan to a
- *		new page.  We now have the new value btps_scanPage; some other backend
+ *		new page.  We now have the new value btps_nextScanPage; another backend
  *		can now begin advancing the scan.
  *
- * Callers whose scan uses array keys must save their scan_page argument so
+ * Callers whose scan uses array keys must save their curr_page argument so
  * that it can be passed to _bt_parallel_primscan_schedule, should caller
- * determine that another primitive index scan is required.  If that happens,
- * scan_page won't be scanned by any backend (unless the next primitive index
- * scan lands on scan_page).
+ * determine that another primitive index scan is required.
+ *
+ * Note: unlike the serial case, parallel scans don't need to remember both
+ * sibling links.  next_scan_page is whichever link is next given the scan's
+ * direction, since its direction cannot change when parallelism is in use.
  */
 void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
+					 BlockNumber curr_page)
 {
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
@@ -722,7 +743,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	btscan->btps_scanPage = scan_page;
+	btscan->btps_nextScanPage = next_scan_page;
+	btscan->btps_lastCurrPage = curr_page;
 	btscan->btps_pageStatus = BTPARALLEL_IDLE;
 	SpinLockRelease(&btscan->btps_mutex);
 	ConditionVariableSignal(&btscan->btps_cv);
@@ -778,13 +800,13 @@ _bt_parallel_done(IndexScanDesc scan)
 /*
  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
  *
- * Caller passes the block number most recently passed to _bt_parallel_release
+ * Caller passes the curr_page most recently passed to _bt_parallel_release
  * by its backend.  Caller successfully schedules the next primitive index scan
  * if the shared parallel state hasn't been seized since caller's backend last
  * advanced the scan.
  */
 void
-_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -796,10 +818,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	if (btscan->btps_scanPage == prev_scan_page &&
+	if (btscan->btps_lastCurrPage == curr_page &&
 		btscan->btps_pageStatus == BTPARALLEL_IDLE)
 	{
-		btscan->btps_scanPage = InvalidBlockNumber;
+		btscan->btps_nextScanPage = InvalidBlockNumber;
+		btscan->btps_lastCurrPage = InvalidBlockNumber;
 		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
 
 		/* Serialize scan's current array keys */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 91ac6533f..8912f17df 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -43,12 +43,13 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
 									   OffsetNumber offnum,
 									   ItemPointer heapTid, int tupleOffset);
 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
-static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
-static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
-								  ScanDirection dir);
-static Buffer _bt_walk_left(Relation rel, Buffer buf);
+static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
+							  ScanDirection dir);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+							 BlockNumber lastcurrblkno, ScanDirection dir);
+static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+										 BlockNumber lastcurrblkno);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
-static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
 
 /*
@@ -880,7 +881,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	Buffer		buf;
 	BTStack		stack;
 	OffsetNumber offnum;
 	StrategyNumber strat;
@@ -889,10 +889,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
 	int			keysz = 0;
 	int			i;
-	bool		status;
 	StrategyNumber strat_total;
 	BTScanPosItem *currItem;
-	BlockNumber blkno;
 
 	Assert(!BTScanPosIsValid(so->currPos));
 
@@ -924,7 +922,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (scan->parallel_scan != NULL)
 	{
-		status = _bt_parallel_seize(scan, &blkno, true);
+		BlockNumber blkno,
+					lastcurrblkno;
+		bool		status;
+
+		status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true);
 
 		/*
 		 * Initialize arrays (when _bt_parallel_seize didn't already set up
@@ -942,7 +944,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 		else if (blkno != InvalidBlockNumber)
 		{
-			if (!_bt_parallel_readpage(scan, blkno, dir))
+			Assert(!so->needPrimScan);
+
+			/*
+			 * We anticipated starting another primitive scan, but some other
+			 * worker bet us to it
+			 */
+			if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
 				return false;
 			goto readcomplete;
 		}
@@ -1392,12 +1400,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * position ourselves on the target leaf page.
 	 */
 	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
-	stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
+	stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
 
 	/* don't need to keep the stack around... */
 	_bt_freestack(stack);
 
-	if (!BufferIsValid(buf))
+	if (!BufferIsValid(so->currPos.buf))
 	{
 		/*
 		 * We only get here if the index is completely empty. Lock relation
@@ -1411,11 +1419,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		if (IsolationIsSerializable())
 		{
 			PredicateLockRelation(rel, scan->xs_snapshot);
-			stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
+			stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
 			_bt_freestack(stack);
 		}
 
-		if (!BufferIsValid(buf))
+		if (!BufferIsValid(so->currPos.buf))
 		{
 			/*
 			 * Mark parallel scan as done, so that all the workers can finish
@@ -1427,17 +1435,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 	}
 
-	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
-
-	_bt_initialize_more_data(so, dir);
-
 	/* position to the precise item on the page */
-	offnum = _bt_binsrch(rel, &inskey, buf);
-	Assert(!BTScanPosIsValid(so->currPos));
-	so->currPos.buf = buf;
+	offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
 
 	/*
-	 * Now load data from the first page of the scan.
+	 * Now load data from the first page of the scan (usually but not always
+	 * the page currently in so->currPos.buf).
 	 *
 	 * If inskey.nextkey = false and inskey.backward = false, offnum is
 	 * positioned at the first non-pivot tuple >= inskey.scankeys.
@@ -1455,21 +1458,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * for the page.  For example, when inskey is both < the leaf page's high
 	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
 	 */
-	if (!_bt_readpage(scan, dir, offnum, true))
-	{
-		/*
-		 * There's no actually-matching data on this page.  Try to advance to
-		 * the next page.  Return false if there's no matching data at all.
-		 */
-		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
-		if (!_bt_steppage(scan, dir))
-			return false;
-	}
-	else
-	{
-		/* We have at least one item to return as scan's first item */
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
-	}
+	if (!_bt_readfirstpage(scan, offnum, dir))
+		return false;
 
 readcomplete:
 	/* OK, itemIndex says what to return */
@@ -1545,14 +1535,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
  * that there can be no more matching tuples in the current scan direction
  * (could just be for the current primitive index scan when scan has arrays).
  *
- * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
- * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
- * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
- * its scan key was marked required), so _bt_first _must_ pass us an offnum
- * exactly at the beginning of where equal tuples are to be found.  When we're
- * passed an offnum past the end of the page, we might still manage to stop
- * the scan on this page by calling _bt_checkkeys against the high key.
- *
  * In the case of a parallel scan, caller must have called _bt_parallel_seize
  * prior to calling this function; this function will invoke
  * _bt_parallel_release before returning.
@@ -1563,6 +1545,7 @@ static bool
 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 			 bool firstPage)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Page		page;
 	BTPageOpaque opaque;
@@ -1581,19 +1564,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 
 	page = BufferGetPage(so->currPos.buf);
 	opaque = BTPageGetOpaque(page);
+	Assert(!P_IGNORE(opaque));
 
-	/* allow next page be processed by parallel worker */
-	if (scan->parallel_scan)
-	{
-		if (ScanDirectionIsForward(dir))
-			pstate.prev_scan_page = opaque->btpo_next;
-		else
-			pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
-
-		_bt_parallel_release(scan, pstate.prev_scan_page);
-	}
-
-	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+	indnatts = IndexRelationGetNumberOfAttributes(rel);
 	arrayKeys = so->numArrayKeys != 0;
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
@@ -1615,8 +1588,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	/*
 	 * We note the buffer's block number so that we can release the pin later.
 	 * This allows us to re-read the buffer if it is needed again for hinting.
+	 * Also save the page's sibling links; this tells us where to step
+	 * right/left to after we're done reading items from this page.
 	 */
 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+	so->currPos.prevPage = opaque->btpo_prev;
+	so->currPos.nextPage = opaque->btpo_next;
 
 	/*
 	 * We save the LSN of the page as we read it, so that we know whether it
@@ -1625,13 +1602,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	 */
 	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
 
-	/*
-	 * we must save the page's right-link while scanning it; this tells us
-	 * where to step right to after we're done with these items.  There is no
-	 * corresponding need for the left-link, since splits always go right.
-	 */
-	so->currPos.nextPage = opaque->btpo_next;
-
 	/* initialize tuple workspace to empty */
 	so->currPos.nextTupleOffset = 0;
 
@@ -1641,6 +1611,19 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/* allow next page to be processed by parallel worker */
+	if (scan->parallel_scan)
+	{
+		if (ScanDirectionIsForward(dir))
+			_bt_parallel_release(scan, so->currPos.nextPage,
+								 so->currPos.currPage);
+		else
+			_bt_parallel_release(scan, so->currPos.prevPage,
+								 so->currPos.currPage);
+	}
+
+	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
+
 	/*
 	 * Prechecking the value of the continuescan flag for the last item on the
 	 * page (for backwards scan it will be the first item on a page).  If we
@@ -1829,7 +1812,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
 			int			truncatt;
 
-			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
+			truncatt = BTreeTupleGetNAtts(itup, rel);
 			pstate.prechecked = false;	/* precheck didn't cover HIKEY */
 			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
 		}
@@ -1959,7 +1942,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		 * We don't need to visit page to the left when no more matches will
 		 * be found there
 		 */
-		if (!pstate.continuescan || P_LEFTMOST(opaque))
+		if (!pstate.continuescan)
 			so->currPos.moreLeft = false;
 
 		Assert(itemIndex >= 0);
@@ -2060,20 +2043,25 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 /*
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
- * if pinned, we'll drop the pin before moving to next page.  The buffer is
- * not locked on entry.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
+ * The scan must have a valid so->currPos position in any case.  If there's no
+ * pin held, then that must be because _bt_drop_lock_and_maybe_pin dropped the
+ * pin early as an optimization.
  *
- * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
- * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
- * to InvalidBuffer.  We return true to indicate success.
+ * This is a wrapper on _bt_readnextpage that performs final steps for the
+ * current page.  It sets up the _bt_readnextpage call using either local
+ * state saved in so->currPos by the most recent _bt_readpage call, or using
+ * shared parallel scan state (obtained by seizing the parallel scan here).
+ *
+ * Parallel scan callers that have already seized the scan should directly
+ * call _bt_readnextpage, rather than calling here.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	BlockNumber blkno = InvalidBlockNumber;
-	bool		status;
+	BlockNumber blkno,
+				lastcurrblkno;
 
 	Assert(BTScanPosIsValid(so->currPos));
 
@@ -2098,6 +2086,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 				   so->currPos.nextTupleOffset);
 		so->markPos.itemIndex = so->markItemIndex;
 		so->markItemIndex = -1;
+		so->markDir = dir;
 
 		/*
 		 * If we're just about to start the next primitive index scan
@@ -2115,7 +2104,6 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		 * In effect, btrestpos leaves advancing the arrays up to the first
 		 * _bt_readpage call (that takes place after it has restored markPos).
 		 */
-		Assert(so->markPos.dir == dir);
 		if (so->needPrimScan)
 		{
 			if (ScanDirectionIsForward(dir))
@@ -2123,319 +2111,302 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 			else
 				so->markPos.moreLeft = true;
 		}
+
+		/* mark/restore not supported by parallel scans */
+		Assert(!scan->parallel_scan);
 	}
 
-	if (ScanDirectionIsForward(dir))
+	/* release the pin when _bt_drop_lock_and_maybe_pin couldn't earlier on */
+	BTScanPosUnpinIfPinned(so->currPos);
+
+	/* Walk to the next page with data */
+	if (!scan->parallel_scan)
 	{
-		/* Walk right to the next page with data */
-		if (scan->parallel_scan != NULL)
-		{
-			/*
-			 * Seize the scan to get the next block number; if the scan has
-			 * ended already, bail out.
-			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
-			if (!status)
-			{
-				/* release the previous buffer, if pinned */
-				BTScanPosUnpinIfPinned(so->currPos);
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
-		}
-		else
-		{
-			/* Not parallel, so use the previously-saved nextPage link. */
+		/* Not parallel, so use local state set by the last _bt_readpage */
+		if (ScanDirectionIsForward(dir))
 			blkno = so->currPos.nextPage;
-		}
-
-		/* Remember we left a page with data */
-		so->currPos.moreLeft = true;
-
-		/* release the previous buffer, if pinned */
-		BTScanPosUnpinIfPinned(so->currPos);
-	}
-	else
-	{
-		/* Remember we left a page with data */
-		so->currPos.moreRight = true;
-
-		if (scan->parallel_scan != NULL)
-		{
-			/*
-			 * Seize the scan to get the current block number; if the scan has
-			 * ended already, bail out.
-			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
-			BTScanPosUnpinIfPinned(so->currPos);
-			if (!status)
-			{
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
-		}
 		else
-		{
-			/* Not parallel, so just use our own notion of the current page */
-			blkno = so->currPos.currPage;
-		}
-	}
-
-	if (!_bt_readnextpage(scan, blkno, dir))
-		return false;
-
-	/* We have at least one item to return as scan's next item */
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
-
-	return true;
-}
-
-/*
- *	_bt_readnextpage() -- Read next page containing valid data for scan
- *
- * On success exit, so->currPos is updated to contain data from the next
- * interesting page, and we return true.  Caller must release the lock (and
- * maybe the pin) on the buffer on success exit.
- *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
- */
-static bool
-_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
-{
-	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	Relation	rel;
-	Page		page;
-	BTPageOpaque opaque;
-	bool		status;
-
-	rel = scan->indexRelation;
-
-	if (ScanDirectionIsForward(dir))
-	{
-		for (;;)
-		{
-			/*
-			 * if we're at end of scan, give up and mark parallel scan as
-			 * done, so that all the workers can finish their scan
-			 */
-			if (blkno == P_NONE || !so->currPos.moreRight)
-			{
-				_bt_parallel_done(scan);
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
-			/* check for interrupts while we're not holding any buffer lock */
-			CHECK_FOR_INTERRUPTS();
-			/* step right one page */
-			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
-			page = BufferGetPage(so->currPos.buf);
-			opaque = BTPageGetOpaque(page);
-			/* check for deleted page */
-			if (!P_IGNORE(opaque))
-			{
-				PredicateLockPage(rel, blkno, scan->xs_snapshot);
-				/* see if there are any matches on this page */
-				/* note that this will clear moreRight if we can stop */
-				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
-					break;
-			}
-			else if (scan->parallel_scan != NULL)
-			{
-				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, opaque->btpo_next);
-			}
-
-			/* nope, keep going */
-			if (scan->parallel_scan != NULL)
-			{
-				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
-				if (!status)
-				{
-					BTScanPosInvalidate(so->currPos);
-					return false;
-				}
-			}
-			else
-			{
-				blkno = opaque->btpo_next;
-				_bt_relbuf(rel, so->currPos.buf);
-			}
-		}
+			blkno = so->currPos.prevPage;
+		lastcurrblkno = so->currPos.currPage;
 	}
 	else
 	{
 		/*
-		 * Should only happen in parallel cases, when some other backend
-		 * advanced the scan.
+		 * Seize the scan to get the nextPage and currPage from shared
+		 * parallel state (saved from parallel scan's last _bt_readpage)
 		 */
-		if (so->currPos.currPage != blkno)
-		{
-			BTScanPosUnpinIfPinned(so->currPos);
-			so->currPos.currPage = blkno;
-		}
+		if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+			return false;
+	}
 
-		/* Done if we know that the left sibling link isn't of interest */
-		if (!so->currPos.moreLeft)
+	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir);
+}
+
+/*
+ *	_bt_readfirstpage() -- Read first page containing valid data for _bt_first
+ *
+ * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
+ * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
+ * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
+ * its scan key was marked required), so _bt_first _must_ pass us an offnum
+ * exactly at the beginning of where equal tuples are to be found.  When we're
+ * passed an offnum past the end of the page, we might still manage to stop
+ * the scan on this page by calling _bt_checkkeys against the high key.  See
+ * _bt_readpage for full details.
+ *
+ * On entry, so->currPos must be pinned and locked (so offnum stays valid).
+ * Parallel scan callers must have seized the scan before calling here.
+ *
+ * On exit, we'll have updated so->currPos and retained locks and pins
+ * according to the same rules as those laid out for _bt_readnextpage exit.
+ * Like _bt_readnextpage, our return value indicates if there are any matching
+ * records in the given direction.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we'll call _bt_parallel_release as soon as possible.
+ */
+static bool
+_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	Assert(BufferIsValid(so->currPos.buf));
+
+	so->numKilled = 0;			/* just paranoia */
+	so->markItemIndex = -1;		/* ditto */
+
+	/* Initialize currPos for so->currPos */
+	if (so->needPrimScan)
+	{
+		Assert(so->numArrayKeys);
+
+		so->currPos.moreLeft = true;
+		so->currPos.moreRight = true;
+		so->needPrimScan = false;
+	}
+	else if (ScanDirectionIsForward(dir))
+	{
+		so->currPos.moreLeft = false;
+		so->currPos.moreRight = true;
+	}
+	else
+	{
+		so->currPos.moreLeft = true;
+		so->currPos.moreRight = false;
+	}
+
+	/*
+	 * Attempt to load matching tuples from the page in so->currPos.buf.
+	 *
+	 * Note that _bt_readpage will finish initializing the so->currPos fields.
+	 * _bt_readpage also releases parallel scan (even when it returns false).
+	 */
+	if (_bt_readpage(scan, dir, offnum, true))
+	{
+		/*
+		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
+		 * so->currPos.buf in preparation for btgettuple returning tuples.
+		 */
+		Assert(BTScanPosIsPinned(so->currPos));
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		return true;
+	}
+
+	/* There's no actually-matching data on the page in so->currPos.buf */
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+
+	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
+	if (!_bt_steppage(scan, dir))
+		return false;
+
+	/* _bt_readpage for a later page (now in so->currPos) succeeded */
+	return true;
+}
+
+/*
+ *	_bt_readnextpage() -- Read next page containing valid data for _bt_next
+ *
+ * Caller's blkno is the next interesting page's link, taken from either the
+ * previously-saved right link or left link.  lastcurrblkno is the page that
+ * was current at the point where the blkno link was saved, which we use to
+ * reason about concurrent page splits/page deletions during backwards scans
+ * (_bt_parallel_seize also requires it, regardless of scan direction).
+ *
+ * On entry, caller shouldn't hold any locks or pins on any page (we work
+ * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
+ * must have seized the scan before calling here (blkno and lastcurrblkno
+ * arguments should come from the seized scan).
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page, and we return true.  We hold a pin on the buffer on
+ * success exit, except for scans that _bt_drop_lock_and_maybe_pin indicates
+ * can safely have their pin dropped to avoid blocking progress by VACUUM.
+ *
+ * If there are no more matching records in the given direction, we drop all
+ * locks and pins, and return false.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we'll call _bt_parallel_release as soon as possible.
+ */
+static bool
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+				 BlockNumber lastcurrblkno, ScanDirection dir)
+{
+	Relation	rel = scan->indexRelation;
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	Page		page;
+	BTPageOpaque opaque;
+
+	/*
+	 * Invalidate currPos, to make it safe for us to return 'false'.
+	 *
+	 * Note: this won't unset moreLeft and moreRight from lastcurrblkno.
+	 */
+	BTScanPosInvalidate(so->currPos);
+
+	/*
+	 * Prepare for blkno to become so->currPos.currPage in _bt_readpage.
+	 *
+	 * We know that the scan just read a page (lastcurrblkno), so remember
+	 * that we left a place with potentially matching tuples to blkno's left
+	 * (or to blkno's right, when we're scanning backwards).
+	 *
+	 * Note: we deliberately avoid updating moreRight (or moreLeft, during
+	 * backwards scans) at this point.  When there's no more data to the right
+	 * of lastcurrblkno (during a forward scan), then clearly there also can't
+	 * be any more data to the right of its blkno right sibling page now.
+	 * (During backwards scans there can't be any more data to the left of its
+	 * blkno left sibling page.)
+	 */
+	if (ScanDirectionIsForward(dir))
+		so->currPos.moreLeft = true;
+	else
+		so->currPos.moreRight = true;
+
+	for (;;)
+	{
+		/*
+		 * if we're at end of scan, give up and mark parallel scan as done, so
+		 * that all the workers can finish their scan
+		 */
+		if (blkno == P_NONE ||
+			(ScanDirectionIsForward(dir) ?
+			 !so->currPos.moreRight : !so->currPos.moreLeft))
 		{
-			BTScanPosUnpinIfPinned(so->currPos);
 			_bt_parallel_done(scan);
-			BTScanPosInvalidate(so->currPos);
 			return false;
 		}
 
-		/*
-		 * Walk left to the next page with data.  This is much more complex
-		 * than the walk-right case because of the possibility that the page
-		 * to our left splits while we are in flight to it, plus the
-		 * possibility that the page we were on gets deleted after we leave
-		 * it.  See nbtree/README for details.
-		 *
-		 * It might be possible to rearrange this code to have less overhead
-		 * in pinning and locking, but that would require capturing the left
-		 * sibling block number when the page is initially read, and then
-		 * optimistically starting there (rather than pinning the page twice).
-		 * It is not clear that this would be worth the complexity.
-		 */
-		if (BTScanPosIsPinned(so->currPos))
-			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
-		else
-			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
-
-		for (;;)
+		if (ScanDirectionIsForward(dir))
 		{
-			/* Done if we know that the left sibling link isn't of interest */
-			if (!so->currPos.moreLeft)
-			{
-				_bt_relbuf(rel, so->currPos.buf);
-				_bt_parallel_done(scan);
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
-
-			/* Step to next physical page */
-			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
-
-			/* if we're physically at end of index, return failure */
+			/* read blkno, but check for interrupts first */
+			CHECK_FOR_INTERRUPTS();
+			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+		}
+		else
+		{
+			/* read blkno, avoiding race (also checks for interrupts) */
+			so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
+														 lastcurrblkno);
 			if (so->currPos.buf == InvalidBuffer)
 			{
+				/* must have been a concurrent deletion of leftmost page */
 				_bt_parallel_done(scan);
-				BTScanPosInvalidate(so->currPos);
 				return false;
 			}
+		}
 
-			/*
-			 * Okay, we managed to move left to a non-deleted page. Done if
-			 * it's not half-dead and contains matching tuples. Else loop back
-			 * and do it all again.
-			 */
-			page = BufferGetPage(so->currPos.buf);
-			opaque = BTPageGetOpaque(page);
-			if (!P_IGNORE(opaque))
+		page = BufferGetPage(so->currPos.buf);
+		opaque = BTPageGetOpaque(page);
+		lastcurrblkno = blkno;
+		if (!P_IGNORE(opaque))
+		{
+			/* see if there are any matches on this page */
+			if (ScanDirectionIsForward(dir))
+			{
+				/* note that this will clear moreRight if we can stop */
+				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
+					break;
+				blkno = so->currPos.nextPage;
+			}
+			else
 			{
-				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
-				/* see if there are any matches on this page */
 				/* note that this will clear moreLeft if we can stop */
 				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
 					break;
-			}
-			else if (scan->parallel_scan != NULL)
-			{
-				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
-			}
-
-			/*
-			 * For parallel scans, get the last page scanned as it is quite
-			 * possible that by the time we try to seize the scan, some other
-			 * worker has already advanced the scan to a different page.  We
-			 * must continue based on the latest page scanned by any worker.
-			 */
-			if (scan->parallel_scan != NULL)
-			{
-				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
-				if (!status)
-				{
-					BTScanPosInvalidate(so->currPos);
-					return false;
-				}
-				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+				blkno = so->currPos.prevPage;
 			}
 		}
+		else
+		{
+			/* _bt_readpage not called, so do all this for ourselves */
+			if (ScanDirectionIsForward(dir))
+				blkno = opaque->btpo_next;
+			else
+				blkno = opaque->btpo_prev;
+			if (scan->parallel_scan)
+				_bt_parallel_release(scan, blkno, lastcurrblkno);
+		}
+
+		/* no matching tuples on this page */
+		_bt_relbuf(rel, so->currPos.buf);
+
+		/* working off of new blkno now */
+		BTScanPosInvalidate(so->currPos);
+
+		/* parallel scan seizes another page (don't use so->currPos blkno) */
+		if (scan->parallel_scan &&
+			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
+			return false;
 	}
 
-	return true;
-}
-
-/*
- *	_bt_parallel_readpage() -- Read current page containing valid data for scan
- *
- * On success, release lock and maybe pin on buffer.  We return true to
- * indicate success.
- */
-static bool
-_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
-{
-	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-
-	Assert(!so->needPrimScan);
-
-	_bt_initialize_more_data(so, dir);
-
-	if (!_bt_readnextpage(scan, blkno, dir))
-		return false;
-
-	/* We have at least one item to return as scan's next item */
+	/*
+	 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
+	 * so->currPos.buf in preparation for btgettuple returning tuples.
+	 */
+	Assert(so->currPos.currPage == blkno);
+	Assert(BTScanPosIsPinned(so->currPos));
 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 
 	return true;
 }
 
 /*
- * _bt_walk_left() -- step left one page, if possible
+ * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
+ * recovering from concurrent page splits/page deletions when necessary
  *
- * The given buffer must be pinned and read-locked.  This will be dropped
- * before stepping left.  On return, we have pin and read lock on the
- * returned page, instead.
+ * Called during backwards scans, to deal with their unique concurrency rules.
  *
- * Returns InvalidBuffer if there is no page to the left (no lock is held
- * in that case).
+ * blkno points to the block number of the page that we expect to move the
+ * scan to.  We'll successfully move the scan there when we find that its
+ * right sibling link points to lastcurrblkno, the page that we just finished
+ * reading.  Otherwise, we have to figure out which page is the correct one to
+ * visit next the hard way, which involves reasoning about both concurrent
+ * page splits and concurrent page deletions.  See nbtree/README for details.
+ *
+ * On return, we have both a pin and a read lock on the returned page, whose
+ * block number will be set in *blkno.  Returns InvalidBuffer if there is no
+ * page to the left (no lock is held in that case).
  *
  * It is possible for the returned leaf page to be half-dead; caller must
  * check that condition and step left again when required.
  */
 static Buffer
-_bt_walk_left(Relation rel, Buffer buf)
+_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+						   BlockNumber lastcurrblkno)
 {
-	Page		page;
-	BTPageOpaque opaque;
-
-	page = BufferGetPage(buf);
-	opaque = BTPageGetOpaque(page);
+	BlockNumber origblkno = *blkno; /* detects circular links */
 
 	for (;;)
 	{
-		BlockNumber obknum;
-		BlockNumber lblkno;
-		BlockNumber blkno;
+		Buffer		buf;
+		Page		page;
+		BTPageOpaque opaque;
 		int			tries;
 
-		/* if we're at end of tree, release buf and return failure */
-		if (P_LEFTMOST(opaque))
-		{
-			_bt_relbuf(rel, buf);
-			break;
-		}
-		/* remember original page we are stepping left from */
-		obknum = BufferGetBlockNumber(buf);
-		/* step left */
-		blkno = lblkno = opaque->btpo_prev;
-		_bt_relbuf(rel, buf);
 		/* check for interrupts while we're not holding any buffer lock */
 		CHECK_FOR_INTERRUPTS();
-		buf = _bt_getbuf(rel, blkno, BT_READ);
+		buf = _bt_getbuf(rel, *blkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 
@@ -2452,21 +2423,26 @@ _bt_walk_left(Relation rel, Buffer buf)
 		tries = 0;
 		for (;;)
 		{
-			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+			if (!P_ISDELETED(opaque) && opaque->btpo_next == lastcurrblkno)
 			{
 				/* Found desired page, return it */
 				return buf;
 			}
 			if (P_RIGHTMOST(opaque) || ++tries > 4)
 				break;
-			blkno = opaque->btpo_next;
-			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+			/* step right */
+			*blkno = opaque->btpo_next;
+			buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
 			page = BufferGetPage(buf);
 			opaque = BTPageGetOpaque(page);
 		}
 
-		/* Return to the original page to see what's up */
-		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+		/*
+		 * Return to the original page (usually the page most recently read by
+		 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
+		 * what's up with sibling links
+		 */
+		buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 		if (P_ISDELETED(opaque))
@@ -2482,8 +2458,8 @@ _bt_walk_left(Relation rel, Buffer buf)
 				if (P_RIGHTMOST(opaque))
 					elog(ERROR, "fell off the end of index \"%s\"",
 						 RelationGetRelationName(rel));
-				blkno = opaque->btpo_next;
-				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+				*blkno = opaque->btpo_next;
+				buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
 				page = BufferGetPage(buf);
 				opaque = BTPageGetOpaque(page);
 				if (!P_ISDELETED(opaque))
@@ -2491,9 +2467,10 @@ _bt_walk_left(Relation rel, Buffer buf)
 			}
 
 			/*
-			 * Now return to top of loop, resetting obknum to point to this
-			 * nondeleted page, and try again.
+			 * remember that this page is actually now the leftmost one whose
+			 * key space we've already read all tuples from
 			 */
+			lastcurrblkno = BufferGetBlockNumber(buf);
 		}
 		else
 		{
@@ -2502,11 +2479,22 @@ _bt_walk_left(Relation rel, Buffer buf)
 			 * to the left got split or deleted. Without this check, we'd go
 			 * into an infinite loop if there's anything wrong.
 			 */
-			if (opaque->btpo_prev == lblkno)
+			if (opaque->btpo_prev == origblkno)
 				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
-					 obknum, RelationGetRelationName(rel));
-			/* Okay to try again with new lblkno value */
+					 lastcurrblkno, RelationGetRelationName(rel));
+			/* Okay to try again, since left sibling link changed */
 		}
+
+		if (P_LEFTMOST(opaque))
+		{
+			/* previous leftmost page must have been concurrently deleted */
+			_bt_relbuf(rel, buf);
+			break;
+		}
+
+		/* next loop iteration will start over with a clean slate */
+		*blkno = origblkno = opaque->btpo_prev;
+		_bt_relbuf(rel, buf);
 	}
 
 	return InvalidBuffer;
@@ -2606,7 +2594,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	Buffer		buf;
 	Page		page;
 	BTPageOpaque opaque;
 	OffsetNumber start;
@@ -2617,9 +2604,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	 * version of _bt_search().  We don't maintain a stack since we know we
 	 * won't need it.
 	 */
-	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
+	so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
 
-	if (!BufferIsValid(buf))
+	if (!BufferIsValid(so->currPos.buf))
 	{
 		/*
 		 * Empty index. Lock the whole relation, as nothing finer to lock
@@ -2630,8 +2617,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		return false;
 	}
 
-	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
-	page = BufferGetPage(buf);
+	page = BufferGetPage(so->currPos.buf);
 	opaque = BTPageGetOpaque(page);
 	Assert(P_ISLEAF(opaque));
 
@@ -2654,29 +2640,11 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		start = 0;				/* keep compiler quiet */
 	}
 
-	/* remember which buffer we have pinned */
-	so->currPos.buf = buf;
-
-	_bt_initialize_more_data(so, dir);
-
 	/*
 	 * Now load data from the first page of the scan.
 	 */
-	if (!_bt_readpage(scan, dir, start, true))
-	{
-		/*
-		 * There's no actually-matching data on this page.  Try to advance to
-		 * the next page.  Return false if there's no matching data at all.
-		 */
-		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
-		if (!_bt_steppage(scan, dir))
-			return false;
-	}
-	else
-	{
-		/* We have at least one item to return as scan's first item */
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
-	}
+	if (!_bt_readfirstpage(scan, start, dir))
+		return false;
 
 	/* OK, itemIndex says what to return */
 	currItem = &so->currPos.items[so->currPos.itemIndex];
@@ -2686,33 +2654,3 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 
 	return true;
 }
-
-/*
- * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
- * from currPos
- */
-static inline void
-_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
-{
-	so->currPos.dir = dir;
-	if (so->needPrimScan)
-	{
-		Assert(so->numArrayKeys);
-
-		so->currPos.moreLeft = true;
-		so->currPos.moreRight = true;
-		so->needPrimScan = false;
-	}
-	else if (ScanDirectionIsForward(dir))
-	{
-		so->currPos.moreLeft = false;
-		so->currPos.moreRight = true;
-	}
-	else
-	{
-		so->currPos.moreLeft = true;
-		so->currPos.moreRight = false;
-	}
-	so->numKilled = 0;			/* just paranoia */
-	so->markItemIndex = -1;		/* ditto */
-}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 61ded00ca..76be65123 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2407,7 +2407,7 @@ new_prim_scan:
 	so->needPrimScan = true;	/* ...but call _bt_first again */
 
 	if (scan->parallel_scan)
-		_bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
+		_bt_parallel_primscan_schedule(scan, so->currPos.currPage);
 
 	/* Caller's tuple doesn't match the new qual */
 	return false;
-- 
2.45.2

Reply via email to