From d864b395f0c0012312d518c28787ae40dcdbe5b1 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 8 Oct 2024 11:40:54 -0400
Subject: [PATCH v4] Optimize and simplify nbtree backwards scans.

---
 src/include/access/nbtree.h           |  11 +-
 src/backend/access/nbtree/nbtree.c    |  40 ++-
 src/backend/access/nbtree/nbtsearch.c | 463 +++++++++++++-------------
 src/backend/access/nbtree/nbtutils.c  |   2 +-
 4 files changed, 267 insertions(+), 249 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d64300fb9..c082bdf64 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -954,6 +954,7 @@ typedef struct BTScanPosData
 
 	XLogRecPtr	lsn;			/* pos in the WAL stream when page was read */
 	BlockNumber currPage;		/* page referenced by items array */
+	BlockNumber prevPage;		/* page's left link when we scanned it */
 	BlockNumber nextPage;		/* page's right link when we scanned it */
 
 	/*
@@ -1022,6 +1023,7 @@ typedef BTScanPosData *BTScanPos;
 #define BTScanPosInvalidate(scanpos) \
 	do { \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).prevPage = InvalidBlockNumber; \
 		(scanpos).nextPage = InvalidBlockNumber; \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).lsn = InvalidXLogRecPtr; \
@@ -1090,7 +1092,6 @@ typedef struct BTReadPageState
 	OffsetNumber minoff;		/* Lowest non-pivot tuple's offset */
 	OffsetNumber maxoff;		/* Highest non-pivot tuple's offset */
 	IndexTuple	finaltup;		/* Needed by scans with array keys */
-	BlockNumber prev_scan_page; /* previous _bt_parallel_release block */
 	Page		page;			/* Page being read */
 
 	/* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
@@ -1192,11 +1193,13 @@ extern int	btgettreeheight(Relation rel);
  * prototypes for internal functions in nbtree.c
  */
 extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
-							   bool first);
-extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
+							   BlockNumber *currblkno, bool first);
+extern void _bt_parallel_release(IndexScanDesc scan,
+								 BlockNumber next_scan_page,
+								 BlockNumber curr_page);
 extern void _bt_parallel_done(IndexScanDesc scan);
 extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
-										   BlockNumber prev_scan_page);
+										   BlockNumber curr_page);
 
 /*
  * prototypes for functions in nbtdedup.c
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 56e502c4f..b926b99ba 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -66,7 +66,9 @@ typedef enum
  */
 typedef struct BTParallelScanDescData
 {
-	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
+	BlockNumber btps_nextScanPage;	/* next page to be scanned */
+	BlockNumber btps_lastCurrPage;	/* page whose sibling link was copied into
+									 * btps_nextScanPage */
 	BTPS_State	btps_pageStatus;	/* indicates whether next page is
 									 * available for scan. see above for
 									 * possible states of parallel scan. */
@@ -548,7 +550,8 @@ btinitparallelscan(void *target)
 	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 
 	SpinLockInit(&bt_target->btps_mutex);
-	bt_target->btps_scanPage = InvalidBlockNumber;
+	bt_target->btps_nextScanPage = InvalidBlockNumber;
+	bt_target->btps_lastCurrPage = InvalidBlockNumber;
 	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 	ConditionVariableInit(&bt_target->btps_cv);
 }
@@ -573,7 +576,8 @@ btparallelrescan(IndexScanDesc scan)
 	 * consistency.
 	 */
 	SpinLockAcquire(&btscan->btps_mutex);
-	btscan->btps_scanPage = InvalidBlockNumber;
+	btscan->btps_nextScanPage = InvalidBlockNumber;
+	btscan->btps_lastCurrPage = InvalidBlockNumber;
 	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 	SpinLockRelease(&btscan->btps_mutex);
 }
@@ -600,7 +604,8 @@ btparallelrescan(IndexScanDesc scan)
- * Callers should ignore the value of pageno if the return value is false.
+ * Callers should ignore pageno/currblkno if the return value is false.
  */
 bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
+				   BlockNumber *currblkno, bool first)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		exit_loop = false;
@@ -609,6 +614,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 	BTParallelScanDesc btscan;
 
 	*pageno = P_NONE;
+	*currblkno = InvalidBlockNumber;
 
 	if (first)
 	{
@@ -684,7 +690,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 * of advancing it to a new page!
 			 */
 			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
-			*pageno = btscan->btps_scanPage;
+			*pageno = btscan->btps_nextScanPage;
+			*currblkno = btscan->btps_lastCurrPage;
 			exit_loop = true;
 		}
 		SpinLockRelease(&btscan->btps_mutex);
@@ -699,17 +706,22 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 
 /*
  * _bt_parallel_release() -- Complete the process of advancing the scan to a
- *		new page.  We now have the new value btps_scanPage; some other backend
+ *		new page.  We now have the new value btps_nextScanPage; another backend
  *		can now begin advancing the scan.
  *
- * Callers whose scan uses array keys must save their scan_page argument so
+ * Callers whose scan uses array keys must save their curr_page argument so
  * that it can be passed to _bt_parallel_primscan_schedule, should caller
  * determine that another primitive index scan is required.  If that happens,
- * scan_page won't be scanned by any backend (unless the next primitive index
- * scan lands on scan_page).
+ * curr_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on curr_page).
+ *
+ * Note: unlike the serial case, parallel scans don't need to remember both
+ * sibling links.  Caller can just pass us whichever link is next for the
+ * current scan direction because it cannot change within a parallel scan.
  */
 void
-_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
+_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
+					 BlockNumber curr_page)
 {
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 	BTParallelScanDesc btscan;
@@ -718,7 +730,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	btscan->btps_scanPage = scan_page;
+	btscan->btps_nextScanPage = next_scan_page;
+	btscan->btps_lastCurrPage = curr_page;
 	btscan->btps_pageStatus = BTPARALLEL_IDLE;
 	SpinLockRelease(&btscan->btps_mutex);
 	ConditionVariableSignal(&btscan->btps_cv);
@@ -774,13 +787,13 @@ _bt_parallel_done(IndexScanDesc scan)
 /*
  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
  *
- * Caller passes the block number most recently passed to _bt_parallel_release
+ * Caller passes the curr_page most recently passed to _bt_parallel_release
  * by its backend.  Caller successfully schedules the next primitive index scan
  * if the shared parallel state hasn't been seized since caller's backend last
  * advanced the scan.
  */
 void
-_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -792,10 +805,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
 												  parallel_scan->ps_offset);
 
 	SpinLockAcquire(&btscan->btps_mutex);
-	if (btscan->btps_scanPage == prev_scan_page &&
+	if (btscan->btps_lastCurrPage == curr_page &&
 		btscan->btps_pageStatus == BTPARALLEL_IDLE)
 	{
-		btscan->btps_scanPage = InvalidBlockNumber;
+		btscan->btps_nextScanPage = InvalidBlockNumber;
+		btscan->btps_lastCurrPage = InvalidBlockNumber;
 		btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
 
 		/* Serialize scan's current array keys */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fff7c89ea..047c891d8 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -43,10 +43,12 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
 									   OffsetNumber offnum,
 									   ItemPointer heapTid, int tupleOffset);
 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
-static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
-static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
-								  ScanDirection dir);
-static Buffer _bt_walk_left(Relation rel, Buffer buf);
+static bool _bt_readfirstpage(IndexScanDesc scan, ScanDirection dir,
+							  OffsetNumber offnum);
+static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+							 BlockNumber lastcurrblkno, ScanDirection dir);
+static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+										 BlockNumber lastcurrblkno);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
@@ -924,7 +926,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (scan->parallel_scan != NULL)
 	{
-		status = _bt_parallel_seize(scan, &blkno, true);
+		BlockNumber lastcurrblkno;
+
+		status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true);
 
 		/*
 		 * Initialize arrays (when _bt_parallel_seize didn't already set up
@@ -942,7 +946,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 		else if (blkno != InvalidBlockNumber)
 		{
-			if (!_bt_parallel_readpage(scan, blkno, dir))
+			Assert(!so->needPrimScan);
+
+			_bt_initialize_more_data(so, dir);
+
+			if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
 				return false;
 			goto readcomplete;
 		}
@@ -1427,8 +1435,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 	}
 
-	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
-
 	_bt_initialize_more_data(so, dir);
 
 	/* position to the precise item on the page */
@@ -1455,21 +1461,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * for the page.  For example, when inskey is both < the leaf page's high
 	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
 	 */
-	if (!_bt_readpage(scan, dir, offnum, true))
-	{
-		/*
-		 * There's no actually-matching data on this page.  Try to advance to
-		 * the next page.  Return false if there's no matching data at all.
-		 */
-		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
-		if (!_bt_steppage(scan, dir))
-			return false;
-	}
-	else
-	{
-		/* We have at least one item to return as scan's first item */
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
-	}
+	if (!_bt_readfirstpage(scan, dir, offnum))
+		return false;
 
 readcomplete:
 	/* OK, itemIndex says what to return */
@@ -1582,17 +1575,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	page = BufferGetPage(so->currPos.buf);
 	opaque = BTPageGetOpaque(page);
 
-	/* allow next page be processed by parallel worker */
-	if (scan->parallel_scan)
-	{
-		if (ScanDirectionIsForward(dir))
-			pstate.prev_scan_page = opaque->btpo_next;
-		else
-			pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
-
-		_bt_parallel_release(scan, pstate.prev_scan_page);
-	}
-
 	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
 	arrayKeys = so->numArrayKeys != 0;
 	minoff = P_FIRSTDATAKEY(opaque);
@@ -1626,10 +1608,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
 
 	/*
-	 * we must save the page's right-link while scanning it; this tells us
-	 * where to step right to after we're done with these items.  There is no
-	 * corresponding need for the left-link, since splits always go right.
+	 * we must save the page's right-link and left-link; this tells us where
+	 * to step right/left to after we're done with these items
 	 */
+	so->currPos.prevPage = opaque->btpo_prev;
 	so->currPos.nextPage = opaque->btpo_next;
 
 	/* initialize tuple workspace to empty */
@@ -1640,6 +1622,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	 * good.
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
+	Assert(!P_IGNORE(opaque));
+
+	/* allow next page to be processed by parallel worker */
+	if (scan->parallel_scan)
+	{
+		if (ScanDirectionIsForward(dir))
+			_bt_parallel_release(scan, so->currPos.nextPage,
+								 so->currPos.currPage);
+		else
+			_bt_parallel_release(scan, so->currPos.prevPage,
+								 so->currPos.currPage);
+	}
 
 	/*
 	 * Prechecking the value of the continuescan flag for the last item on the
@@ -1934,7 +1928,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 		 * We don't need to visit page to the left when no more matches will
 		 * be found there
 		 */
-		if (!pstate.continuescan || P_LEFTMOST(opaque))
+		if (!pstate.continuescan)
 			so->currPos.moreLeft = false;
 
 		Assert(itemIndex >= 0);
@@ -2035,19 +2029,21 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 /*
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
- * if pinned, we'll drop the pin before moving to next page.  The buffer is
- * not locked on entry.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
+ * This is a wrapper on _bt_readnextpage that performs final steps for the
+ * current page (including dropping its pin).  It sets up the _bt_readnextpage
+ * call using either local state saved by the last _bt_readpage call, or using
+ * shared state consumed by seizing the parallel scan.
  *
- * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
- * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
- * to InvalidBuffer.  We return true to indicate success.
+ * Parallel scan callers that have already seized the scan should directly
+ * call _bt_readnextpage, rather than calling here.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	BlockNumber blkno = InvalidBlockNumber;
+	BlockNumber lastcurrblkno;
 	bool		status;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -2098,85 +2094,139 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 			else
 				so->markPos.moreLeft = true;
 		}
+
+		/* mark/restore not supported by parallel scans */
+		Assert(!scan->parallel_scan);
 	}
 
-	if (ScanDirectionIsForward(dir))
+	/* release the previously pinned leaf page buffer, if any */
+	BTScanPosUnpinIfPinned(so->currPos);
+
+	/* Walk to the next page with data */
+	if (!scan->parallel_scan)
 	{
-		/* Walk right to the next page with data */
-		if (scan->parallel_scan != NULL)
+		/* Not parallel, so use nextPage/prevPage from local scan state */
+		if (ScanDirectionIsForward(dir))
 		{
-			/*
-			 * Seize the scan to get the next block number; if the scan has
-			 * ended already, bail out.
-			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
-			if (!status)
-			{
-				/* release the previous buffer, if pinned */
-				BTScanPosUnpinIfPinned(so->currPos);
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
+			blkno = so->currPos.nextPage;
+			/* Remember we left a page with data */
+			so->currPos.moreLeft = true;
 		}
 		else
 		{
-			/* Not parallel, so use the previously-saved nextPage link. */
-			blkno = so->currPos.nextPage;
+			blkno = so->currPos.prevPage;
+			/* Remember we left a page with data */
+			so->currPos.moreRight = true;
 		}
-
-		/* Remember we left a page with data */
-		so->currPos.moreLeft = true;
-
-		/* release the previous buffer, if pinned */
-		BTScanPosUnpinIfPinned(so->currPos);
+		lastcurrblkno = so->currPos.currPage;
 	}
 	else
 	{
-		/* Remember we left a page with data */
-		so->currPos.moreRight = true;
+		/*
+		 * Seize the scan to get the nextPage and currPage from shared
+		 * parallel state
+		 */
+		status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false);
+		if (!status)
+		{
+			BTScanPosInvalidate(so->currPos);
+			return false;
+		}
 
-		if (scan->parallel_scan != NULL)
-		{
-			/*
-			 * Seize the scan to get the current block number; if the scan has
-			 * ended already, bail out.
-			 */
-			status = _bt_parallel_seize(scan, &blkno, false);
-			BTScanPosUnpinIfPinned(so->currPos);
-			if (!status)
-			{
-				BTScanPosInvalidate(so->currPos);
-				return false;
-			}
-		}
-		else
-		{
-			/* Not parallel, so just use our own notion of the current page */
-			blkno = so->currPos.currPage;
-		}
+		/* no point in remembering we left a page with data here */
 	}
 
-	if (!_bt_readnextpage(scan, blkno, dir))
+	if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir))
 		return false;
 
-	/* We have at least one item to return as scan's next item */
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	return true;
+}
 
+/*
+ *	_bt_readfirstpage() -- Read first page containing valid data for scan
+ *
+ * Caller passes us a page offset number that's the first tuple that'll be
+ * read by _bt_readpage.  This offset typically comes from _bt_binsrch.
+ *
+ * On entry, so->currPos.buf must be pinned and locked.  Also, parallel scan
+ * callers must have seized the scan before calling here.
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page, and we return true.  We hold a pin on the buffer on
+ * success exit, except for scans that _bt_drop_lock_and_maybe_pin indicates
+ * can safely have their pin dropped to avoid blocking progress by VACUUM.
+ *
+ * If there are no more matching records in the given direction, we set
+ * so->currPos.buf to InvalidBuffer, and return false.  Note that this could
+ * just signal the end of the current primitive index scan.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we do this by calling _bt_parallel_release at the
+ * earliest opportunity.
+ */
+static bool
+_bt_readfirstpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
+{
+	Relation	rel = scan->indexRelation;
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+	Assert(BufferIsValid(so->currPos.buf));
+	Assert(!P_IGNORE(BTPageGetOpaque(BufferGetPage(so->currPos.buf))));
+
+	PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf),
+					  scan->xs_snapshot);
+
+	if (_bt_readpage(scan, dir, offnum, true))
+	{
+		/* We have at least one item to return as scan's first item */
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		return true;
+	}
+
+	/*
+	 * There's no actually-matching data on this page.  Try to advance to the
+	 * next page.
+	 */
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	if (!_bt_steppage(scan, dir))
+		return false;
+
+	/*
+	 * We have at least one item to return as scan's first item (just not from
+	 * the first page it read).  No need to call _bt_drop_lock_and_maybe_pin
+	 * here; _bt_readnextpage already did that for us.
+	 */
 	return true;
 }
 
 /*
  *	_bt_readnextpage() -- Read next page containing valid data for scan
  *
- * On success exit, so->currPos is updated to contain data from the next
- * interesting page, and we return true.  Caller must release the lock (and
- * maybe the pin) on the buffer on success exit.
+ * Caller's blkno is the next interesting page's link, taken from either the
+ * previously-saved right link or left link.  lastcurrblkno is the page that
+ * was current at the point where the blkno link was saved, which we sometimes
+ * use to reason about concurrent page splits/page deletions (this is only
+ * required by backwards scans, but we always have it, just to be consistent).
  *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
+ * On entry, so->currPos.buf must not be pinned or locked.  Also, parallel
+ * scan callers must have seized the scan before calling here.
+ *
+ * On success exit, so->currPos is updated to contain data from the next
+ * interesting page, and we return true.  We hold a pin on the buffer on
+ * success exit, except for scans that _bt_drop_lock_and_maybe_pin indicates
+ * can safely have their pin dropped to avoid blocking progress by VACUUM.
+ *
+ * If there are no more matching records in the given direction, we set
+ * so->currPos.buf to InvalidBuffer, and return false.  Note that this could
+ * just signal the end of the current primitive index scan.
+ *
+ * We always release the scan for a parallel scan caller, regardless of
+ * success or failure; we do this by calling _bt_parallel_release at the
+ * earliest opportunity.
  */
 static bool
-_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
+_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
+				 BlockNumber lastcurrblkno, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Relation	rel;
@@ -2186,6 +2236,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 
 	rel = scan->indexRelation;
 
+	Assert(!BTScanPosIsPinned(so->currPos));
+
 	if (ScanDirectionIsForward(dir))
 	{
 		for (;;)
@@ -2217,84 +2269,54 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			}
 			else if (scan->parallel_scan != NULL)
 			{
-				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, opaque->btpo_next);
+				/* _bt_readpage not called, so do this for ourselves */
+				_bt_parallel_release(scan, opaque->btpo_next, blkno);
 			}
 
-			/* nope, keep going */
 			if (scan->parallel_scan != NULL)
 			{
+				/* no matching tuples, so seize a page to the right */
 				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
+				status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false);
 				if (!status)
 				{
+					/* don't call _bt_parallel_done */
 					BTScanPosInvalidate(so->currPos);
 					return false;
 				}
 			}
 			else
 			{
+				/* no matching tuples, so consider following right link */
+				lastcurrblkno = blkno;
 				blkno = opaque->btpo_next;
 				_bt_relbuf(rel, so->currPos.buf);
+				so->currPos.buf = InvalidBuffer;
 			}
 		}
 	}
 	else
 	{
-		/*
-		 * Should only happen in parallel cases, when some other backend
-		 * advanced the scan.
-		 */
-		if (so->currPos.currPage != blkno)
-		{
-			BTScanPosUnpinIfPinned(so->currPos);
-			so->currPos.currPage = blkno;
-		}
-
-		/* Done if we know that the left sibling link isn't of interest */
-		if (!so->currPos.moreLeft)
-		{
-			BTScanPosUnpinIfPinned(so->currPos);
-			_bt_parallel_done(scan);
-			BTScanPosInvalidate(so->currPos);
-			return false;
-		}
-
-		/*
-		 * Walk left to the next page with data.  This is much more complex
-		 * than the walk-right case because of the possibility that the page
-		 * to our left splits while we are in flight to it, plus the
-		 * possibility that the page we were on gets deleted after we leave
-		 * it.  See nbtree/README for details.
-		 *
-		 * It might be possible to rearrange this code to have less overhead
-		 * in pinning and locking, but that would require capturing the left
-		 * sibling block number when the page is initially read, and then
-		 * optimistically starting there (rather than pinning the page twice).
-		 * It is not clear that this would be worth the complexity.
-		 */
-		if (BTScanPosIsPinned(so->currPos))
-			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
-		else
-			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
-
 		for (;;)
 		{
-			/* Done if we know that the left sibling link isn't of interest */
-			if (!so->currPos.moreLeft)
+			/*
+			 * if we're at end of scan, give up and mark parallel scan as
+			 * done, so that all the workers can finish their scan
+			 */
+			if (blkno == P_NONE || !so->currPos.moreLeft)
 			{
-				_bt_relbuf(rel, so->currPos.buf);
 				_bt_parallel_done(scan);
 				BTScanPosInvalidate(so->currPos);
 				return false;
 			}
 
-			/* Step to next physical page */
-			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
+			/* move left at least one page (checks for interrupts, too) */
+			so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
+														 lastcurrblkno);
 
-			/* if we're physically at end of index, return failure */
 			if (so->currPos.buf == InvalidBuffer)
 			{
+				/* must have been a concurrent deletion of leftmost page */
 				_bt_parallel_done(scan);
 				BTScanPosInvalidate(so->currPos);
 				return false;
@@ -2309,7 +2331,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			opaque = BTPageGetOpaque(page);
 			if (!P_IGNORE(opaque))
 			{
-				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
+				PredicateLockPage(rel, blkno, scan->xs_snapshot);
 				/* see if there are any matches on this page */
 				/* note that this will clear moreLeft if we can stop */
 				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
@@ -2317,51 +2339,34 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 			}
 			else if (scan->parallel_scan != NULL)
 			{
-				/* allow next page be processed by parallel worker */
-				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+				/* _bt_readpage not called, so do this for ourselves */
+				Assert(P_ISHALFDEAD(opaque));
+				_bt_parallel_release(scan, opaque->btpo_prev, blkno);
 			}
 
-			/*
-			 * For parallel scans, get the last page scanned as it is quite
-			 * possible that by the time we try to seize the scan, some other
-			 * worker has already advanced the scan to a different page.  We
-			 * must continue based on the latest page scanned by any worker.
-			 */
 			if (scan->parallel_scan != NULL)
 			{
+				/* no matching tuples, so seize a page to the left */
 				_bt_relbuf(rel, so->currPos.buf);
-				status = _bt_parallel_seize(scan, &blkno, false);
+				status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false);
 				if (!status)
 				{
+					/* don't call _bt_parallel_done */
 					BTScanPosInvalidate(so->currPos);
 					return false;
 				}
-				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+			}
+			else
+			{
+				/* no matching tuples, so consider following left link */
+				lastcurrblkno = blkno;
+				blkno = opaque->btpo_prev;
+				_bt_relbuf(rel, so->currPos.buf);
+				so->currPos.buf = InvalidBuffer;
 			}
 		}
 	}
 
-	return true;
-}
-
-/*
- *	_bt_parallel_readpage() -- Read current page containing valid data for scan
- *
- * On success, release lock and maybe pin on buffer.  We return true to
- * indicate success.
- */
-static bool
-_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
-{
-	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-
-	Assert(!so->needPrimScan);
-
-	_bt_initialize_more_data(so, dir);
-
-	if (!_bt_readnextpage(scan, blkno, dir))
-		return false;
-
 	/* We have at least one item to return as scan's next item */
 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
 
@@ -2369,48 +2374,41 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 }
 
 /*
- * _bt_walk_left() -- step left one page, if possible
+ * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
+ * recovering from concurrent page splits/page deletions when necessary
  *
- * The given buffer must be pinned and read-locked.  This will be dropped
- * before stepping left.  On return, we have pin and read lock on the
- * returned page, instead.
+ * Called during backwards scans, to deal with their unique concurrency rules.
  *
- * Returns InvalidBuffer if there is no page to the left (no lock is held
- * in that case).
+ * blkno points to the block number of the page that we expect to move the
+ * scan to.  We'll successfully move the scan there when we find that its
+ * right sibling link points to lastcurrblkno, the page that we just finished
+ * reading.  Otherwise, we have to figure out which page is the correct one to
+ * visit next the hard way, which involves reasoning about both concurrent
+ * page splits and concurrent page deletions.  See nbtree/README for details.
+ *
+ * On return, we have both a pin and a read lock on the returned page, whose
+ * block number will be set in *blkno.  Returns InvalidBuffer if there is no
+ * page to the left (no lock is held in that case).
  *
  * It is possible for the returned leaf page to be half-dead; caller must
  * check that condition and step left again when required.
  */
 static Buffer
-_bt_walk_left(Relation rel, Buffer buf)
+_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
+						   BlockNumber lastcurrblkno)
 {
-	Page		page;
-	BTPageOpaque opaque;
-
-	page = BufferGetPage(buf);
-	opaque = BTPageGetOpaque(page);
+	BlockNumber origblkno = *blkno; /* detects circular links */
 
 	for (;;)
 	{
-		BlockNumber obknum;
-		BlockNumber lblkno;
-		BlockNumber blkno;
+		Buffer		buf;
+		Page		page;
+		BTPageOpaque opaque;
 		int			tries;
 
-		/* if we're at end of tree, release buf and return failure */
-		if (P_LEFTMOST(opaque))
-		{
-			_bt_relbuf(rel, buf);
-			break;
-		}
-		/* remember original page we are stepping left from */
-		obknum = BufferGetBlockNumber(buf);
-		/* step left */
-		blkno = lblkno = opaque->btpo_prev;
-		_bt_relbuf(rel, buf);
 		/* check for interrupts while we're not holding any buffer lock */
 		CHECK_FOR_INTERRUPTS();
-		buf = _bt_getbuf(rel, blkno, BT_READ);
+		buf = _bt_getbuf(rel, *blkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 
@@ -2427,21 +2425,26 @@ _bt_walk_left(Relation rel, Buffer buf)
 		tries = 0;
 		for (;;)
 		{
-			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+			if (!P_ISDELETED(opaque) && opaque->btpo_next == lastcurrblkno)
 			{
 				/* Found desired page, return it */
 				return buf;
 			}
 			if (P_RIGHTMOST(opaque) || ++tries > 4)
 				break;
-			blkno = opaque->btpo_next;
-			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+			/* step right */
+			*blkno = opaque->btpo_next;
+			buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
 			page = BufferGetPage(buf);
 			opaque = BTPageGetOpaque(page);
 		}
 
-		/* Return to the original page to see what's up */
-		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
+		/*
+		 * Return to the original page (usually the page most recently read by
+		 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
+		 * what's up with sibling links
+		 */
+		buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 		if (P_ISDELETED(opaque))
@@ -2457,8 +2460,8 @@ _bt_walk_left(Relation rel, Buffer buf)
 				if (P_RIGHTMOST(opaque))
 					elog(ERROR, "fell off the end of index \"%s\"",
 						 RelationGetRelationName(rel));
-				blkno = opaque->btpo_next;
-				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
+				*blkno = opaque->btpo_next;
+				buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
 				page = BufferGetPage(buf);
 				opaque = BTPageGetOpaque(page);
 				if (!P_ISDELETED(opaque))
@@ -2466,9 +2469,10 @@ _bt_walk_left(Relation rel, Buffer buf)
 			}
 
 			/*
-			 * Now return to top of loop, resetting obknum to point to this
-			 * nondeleted page, and try again.
+			 * remember this page as the new lastcurrblkno; it is now the
+			 * leftmost page whose key space we have already read in full
 			 */
+			lastcurrblkno = BufferGetBlockNumber(buf);
 		}
 		else
 		{
@@ -2477,11 +2481,22 @@ _bt_walk_left(Relation rel, Buffer buf)
 			 * to the left got split or deleted. Without this check, we'd go
 			 * into an infinite loop if there's anything wrong.
 			 */
-			if (opaque->btpo_prev == lblkno)
+			if (opaque->btpo_prev == origblkno)
 				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
-					 obknum, RelationGetRelationName(rel));
-			/* Okay to try again with new lblkno value */
+					 lastcurrblkno, RelationGetRelationName(rel));
+			/* Okay to try again, since left sibling link changed */
 		}
+
+		if (P_LEFTMOST(opaque))
+		{
+			/* previous leftmost page must have been concurrently deleted */
+			_bt_relbuf(rel, buf);
+			break;
+		}
+
+		/* next loop iteration will start over with a clean slate */
+		*blkno = origblkno = opaque->btpo_prev;
+		_bt_relbuf(rel, buf);
 	}
 
 	return InvalidBuffer;
@@ -2605,7 +2620,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		return false;
 	}
 
-	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	Assert(P_ISLEAF(opaque));
@@ -2637,21 +2651,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	/*
 	 * Now load data from the first page of the scan.
 	 */
-	if (!_bt_readpage(scan, dir, start, true))
-	{
-		/*
-		 * There's no actually-matching data on this page.  Try to advance to
-		 * the next page.  Return false if there's no matching data at all.
-		 */
-		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
-		if (!_bt_steppage(scan, dir))
-			return false;
-	}
-	else
-	{
-		/* We have at least one item to return as scan's first item */
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
-	}
+	if (!_bt_readfirstpage(scan, dir, start))
+		return false;
 
 	/* OK, itemIndex says what to return */
 	currItem = &so->currPos.items[so->currPos.itemIndex];
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index b4ba51357..03d5c671f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2442,7 +2442,7 @@ new_prim_scan:
 	so->needPrimScan = true;	/* ...but call _bt_first again */
 
 	if (scan->parallel_scan)
-		_bt_parallel_primscan_schedule(scan, pstate->prev_scan_page);
+		_bt_parallel_primscan_schedule(scan, so->currPos.currPage);
 
 	/* Caller's tuple doesn't match the new qual */
 	return false;
-- 
2.45.2

