From fa6d50a9fea6be46bd69865760d5949ab4bf1f2f Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 27 Apr 2018 12:47:39 -0700
Subject: [PATCH v2] Ensure nbtree leaf tuple keys are always unique.

Make comparisons of nbtree index tuples consider heap TID as a
tie-breaker attribute.  Add a separate heap TID attribute to pivot
tuples to make heap TID a first class part of the key space on all
levels of the tree.  The heap TID attribute is sorted in DESC order,
which gives leaf pages that consist entirely of duplicates roughly the
same space usage characteristics as before.
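
In outline, the new comparison rule behaves as in the sketch below,
where compare_user_attrs() is a hypothetical stand-in for the
per-attribute opclass comparisons that _bt_compare() actually performs:

    static int
    compare_with_tid(ScanKey key, int keysz, ItemPointer scantid,
                     IndexTuple itup, ItemPointer heaptid)
    {
        /* compare_user_attrs() is hypothetical; see _bt_compare() */
        int         cmp = compare_user_attrs(key, keysz, itup);

        if (cmp != 0 || scantid == NULL)
            return cmp;
        /* a truncated heap TID in a pivot tuple is negative infinity */
        if (heaptid == NULL)
            return 1;
        /* deliberately inverted argument order gives the DESC sort */
        return ItemPointerCompare(heaptid, scantid);
    }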

Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so this patch also adds suffix
truncation of pivot tuples.  This will usually truncate away the "extra"
heap TID attribute from pivot tuples during a leaf page split, and may
also truncate away additional user attributes.  This can increase
fan-out when there are multiple indexed attributes, though that is only
a secondary goal.
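
The number of attributes left behind is decided by the patch's
_bt_leave_natts(): keep attributes up to and including the first one
that distinguishes the last tuple on the new left page from the first
tuple on the new right page.  A minimal sketch of that loop, with
attr_compare() as a hypothetical stand-in for the real opclass
comparison:

    static int
    leave_natts(IndexTuple lastleft, IndexTuple firstright, int nkeyatts)
    {
        int         keepnatts;

        for (keepnatts = 1; keepnatts <= nkeyatts; keepnatts++)
        {
            /* compare attribute number keepnatts in both tuples */
            if (attr_compare(lastleft, firstright, keepnatts) != 0)
                break;
        }

        /* nkeyatts + 1 means a heap TID tie-breaker must be added */
        return keepnatts;
    }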

This is a proof of concept patch, which is probably only useful as part
of some much larger effort to add cheap retail index tuple deletion.  It
has several significant unresolved issues, including:

* It fails to deal with on-disk compatibility/pg_upgrade.  It also
slightly reduces the maximum amount of space usable for an index tuple,
in order to reserve room for a possible heap TID in a pivot tuple.
(This reduction in the maximum tuple size may ultimately be deemed
acceptable, and in any case seems impossible to avoid.)

* It regresses performance with some workloads to an extent that's not
acceptable.  This may be improved in a future version.
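
To make the size reduction concrete: assuming 8KB pages, 8-byte
MAXALIGN, a 24-byte page header, 4-byte line pointers, and a 16-byte
B-tree special area, the revised BTMaxItemSize() reserves
MAXALIGN(sizeof(ItemPointerData)) = 8 extra bytes per item:

    old: MAXALIGN_DOWN((8192 - MAXALIGN(24 + 3*4) - 16) / 3) = 2712 bytes
    new: MAXALIGN_DOWN((8192 - MAXALIGN(24 + 3*4 + 3*8) - 16) / 3) = 2704 bytes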
---
 contrib/amcheck/verify_nbtree.c       | 219 +++++++++++++++++++++++++---------
 src/backend/access/nbtree/README      |  70 ++++++-----
 src/backend/access/nbtree/nbtinsert.c | 133 +++++++++++++--------
 src/backend/access/nbtree/nbtpage.c   |   6 +-
 src/backend/access/nbtree/nbtsearch.c |  98 +++++++++++++--
 src/backend/access/nbtree/nbtsort.c   |  55 ++++++---
 src/backend/access/nbtree/nbtutils.c  | 155 +++++++++++++++++++-----
 src/backend/access/nbtree/nbtxlog.c   |   3 +
 src/backend/utils/sort/tuplesort.c    |  13 +-
 src/include/access/nbtree.h           |  71 ++++++++---
 src/test/regress/expected/join.out    |   2 +-
 11 files changed, 611 insertions(+), 214 deletions(-)

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index a1438a2855..2358bfa94d 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -45,6 +45,13 @@ PG_MODULE_MAGIC;
  */
 #define InvalidBtreeLevel	((uint32) InvalidBlockNumber)
 
+/*
+ * Convenience macro to get number of key attributes in tuple in low-context
+ * fashion
+ */
+#define BTreeTupleGetNKeyAtts(itup, rel)   \
+	Min(IndexRelationGetNumberOfKeyAttributes(rel), BTreeTupleGetNAtts(itup, rel))
+
 /*
  * State associated with verifying a B-Tree index
  *
@@ -125,25 +132,27 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
 							 BtreeLevel level);
 static void bt_target_page_check(BtreeCheckState *state);
-static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
+static IndexTuple bt_right_page_check_tuple(BtreeCheckState *state);
 static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
-				  ScanKey targetkey);
+				  ScanKey targetkey, ItemPointer scantid, int tupnkeyatts);
 static void bt_downlink_missing_check(BtreeCheckState *state);
 static void bt_tuple_present_callback(Relation index, HeapTuple htup,
 						  Datum *values, bool *isnull,
 						  bool tupleIsAlive, void *checkstate);
 static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
 							OffsetNumber offset);
+static inline bool invariant_l_offset(BtreeCheckState *state,
+					 int tupnkeyatts, ScanKey key, ItemPointer scantid,
+					 OffsetNumber upperbound);
 static inline bool invariant_leq_offset(BtreeCheckState *state,
-					 ScanKey key,
+					 int tupnkeyatts, ScanKey key, ItemPointer scantid,
 					 OffsetNumber upperbound);
-static inline bool invariant_geq_offset(BtreeCheckState *state,
-					 ScanKey key,
+static inline bool invariant_g_offset(BtreeCheckState *state,
+					 int tupnkeyatts, ScanKey key, ItemPointer scantid,
 					 OffsetNumber lowerbound);
-static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
-							   Page other,
-							   ScanKey key,
-							   OffsetNumber upperbound);
+static inline bool invariant_l_nontarget_offset(BtreeCheckState *state,
+							   Page other, int tupnkeyatts, ScanKey key,
+							   ItemPointer scantid, OffsetNumber upperbound);
 static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
 
 /*
@@ -834,8 +843,10 @@ bt_target_page_check(BtreeCheckState *state)
 	{
 		ItemId		itemid;
 		IndexTuple	itup;
-		ScanKey		skey;
 		size_t		tupsize;
+		int			tupnkeyatts;
+		ScanKey		skey;
+		ItemPointer scantid;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -902,8 +913,17 @@ bt_target_page_check(BtreeCheckState *state)
 		if (offset_is_negative_infinity(topaque, offset))
 			continue;
 
-		/* Build insertion scankey for current page offset */
+		/*
+		 * Build insertion scankey for current page offset/tuple.
+		 *
+		 * As required by _bt_mkscankey(), track the number of key attributes,
+		 * which is needed so that _bt_compare() calls handle truncated
+		 * attributes correctly.  Never count non-key attributes in
+		 * non-truncated tuples as key attributes, though.
+		 */
+		tupnkeyatts = BTreeTupleGetNKeyAtts(itup, state->rel);
 		skey = _bt_mkscankey(state->rel, itup);
+		scantid = BTreeTupleGetHeapTID(itup);
 
 		/* Fingerprint leaf page tuples (those that point to the heap) */
 		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
@@ -930,7 +950,7 @@ bt_target_page_check(BtreeCheckState *state)
 		 * and probably not markedly more effective in practice.
 		 */
 		if (!P_RIGHTMOST(topaque) &&
-			!invariant_leq_offset(state, skey, P_HIKEY))
+			!invariant_leq_offset(state, tupnkeyatts, skey, scantid, P_HIKEY))
 		{
 			char	   *itid,
 					   *htid;
@@ -956,11 +976,11 @@ bt_target_page_check(BtreeCheckState *state)
 		 * * Item order check *
 		 *
 		 * Check that items are stored on page in logical order, by checking
-		 * current item is less than or equal to next item (if any).
+		 * current item is strictly less than next item (if any).
 		 */
 		if (OffsetNumberNext(offset) <= max &&
-			!invariant_leq_offset(state, skey,
-								  OffsetNumberNext(offset)))
+			!invariant_l_offset(state, tupnkeyatts, skey, scantid,
+								OffsetNumberNext(offset)))
 		{
 			char	   *itid,
 					   *htid,
@@ -1017,16 +1037,28 @@ bt_target_page_check(BtreeCheckState *state)
 		 */
 		else if (offset == max)
 		{
+			IndexTuple	righttup;
 			ScanKey		rightkey;
+			int			righttupnkeyatts;
+			ItemPointer rightscantid;
 
 			/* Get item in next/right page */
-			rightkey = bt_right_page_check_scankey(state);
+			righttup = bt_right_page_check_tuple(state);
 
-			if (rightkey &&
-				!invariant_geq_offset(state, rightkey, max))
+			/* Set up right item scankey */
+			if (righttup)
+			{
+				righttupnkeyatts = BTreeTupleGetNKeyAtts(righttup, state->rel);
+				rightkey = _bt_mkscankey(state->rel, righttup);
+				rightscantid = BTreeTupleGetHeapTID(righttup);
+			}
+
+			if (righttup &&
+				!invariant_g_offset(state, righttupnkeyatts, rightkey,
+									rightscantid, max))
 			{
 				/*
-				 * As explained at length in bt_right_page_check_scankey(),
+				 * As explained at length in bt_right_page_check_tuple(),
 				 * there is a known !readonly race that could account for
 				 * apparent violation of invariant, which we must check for
 				 * before actually proceeding with raising error.  Our canary
@@ -1069,7 +1101,7 @@ bt_target_page_check(BtreeCheckState *state)
 		{
 			BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
 
-			bt_downlink_check(state, childblock, skey);
+			bt_downlink_check(state, childblock, skey, scantid, tupnkeyatts);
 		}
 	}
 
@@ -1083,9 +1115,9 @@ bt_target_page_check(BtreeCheckState *state)
 }
 
 /*
- * Return a scankey for an item on page to right of current target (or the
+ * Return an index tuple for an item on page to right of current target (or the
  * first non-ignorable page), sufficient to check ordering invariant on last
- * item in current target page.  Returned scankey relies on local memory
+ * item in current target page.  Returned tuple relies on local memory
  * allocated for the child page, which caller cannot pfree().  Caller's memory
  * context should be reset between calls here.
  *
@@ -1098,8 +1130,8 @@ bt_target_page_check(BtreeCheckState *state)
  * Note that !readonly callers must reverify that target page has not
  * been concurrently deleted.
  */
-static ScanKey
-bt_right_page_check_scankey(BtreeCheckState *state)
+static IndexTuple
+bt_right_page_check_tuple(BtreeCheckState *state)
 {
 	BTPageOpaque opaque;
 	ItemId		rightitem;
@@ -1287,11 +1319,10 @@ bt_right_page_check_scankey(BtreeCheckState *state)
 	}
 
 	/*
-	 * Return first real item scankey.  Note that this relies on right page
-	 * memory remaining allocated.
+	 * Return first real item.  Note that this relies on right page memory
+	 * remaining allocated.
 	 */
-	return _bt_mkscankey(state->rel,
-						 (IndexTuple) PageGetItem(rightpage, rightitem));
+	return (IndexTuple) PageGetItem(rightpage, rightitem);
 }
 
 /*
@@ -1305,7 +1336,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
  */
 static void
 bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
-				  ScanKey targetkey)
+				  ScanKey targetkey, ItemPointer scantid, int tupnkeyatts)
 {
 	OffsetNumber offset;
 	OffsetNumber maxoffset;
@@ -1354,7 +1385,7 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
 
 	/*
 	 * Verify child page has the downlink key from target page (its parent) as
-	 * a lower bound.
+	 * a lower bound; downlink must be strictly less than all keys on the page.
 	 *
 	 * Check all items, rather than checking just the first and trusting that
 	 * the operator class obeys the transitive law.
@@ -1404,14 +1435,14 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
 		/*
 		 * Skip comparison of target page key against "negative infinity"
 		 * item, if any.  Checking it would indicate that it's not an upper
-		 * bound, but that's only because of the hard-coding within
-		 * _bt_compare().
+		 * bound, but that's only because of the hard-coding for negative
+		 * infinity items within _bt_compare().
 		 */
 		if (offset_is_negative_infinity(copaque, offset))
 			continue;
 
-		if (!invariant_leq_nontarget_offset(state, child,
-											targetkey, offset))
+		if (!invariant_l_nontarget_offset(state, child, tupnkeyatts, targetkey,
+										  scantid, offset))
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg("down-link lower bound invariant violated for index \"%s\"",
@@ -1751,6 +1782,51 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
 	return !P_ISLEAF(opaque) && offset == P_FIRSTDATAKEY(opaque);
 }
 
+/*
+ * Does the invariant hold that the key is strictly less than a given upper
+ * bound offset item?
+ *
+ * If this function returns false, convention is that caller throws error due
+ * to corruption.
+ */
+static inline bool
+invariant_l_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key,
+				   ItemPointer scantid, OffsetNumber upperbound)
+{
+	int32		cmp;
+
+	cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target,
+					  upperbound);
+
+	/*
+	 * _bt_compare() interprets the absence of attributes in scan keys as
+	 * meaning that they're not participating in a search, not as negative
+	 * infinity (only truncated attributes in tuples within the index are
+	 * treated as negative infinity).  Compensate for that here.
+	 */
+	if (cmp == 0)
+	{
+		ItemId		itemid;
+		IndexTuple	ritup;
+		int			uppnkeyatts;
+		ItemPointer rheaptid;
+
+		itemid = PageGetItemId(state->target, upperbound);
+		ritup = (IndexTuple) PageGetItem(state->target, itemid);
+		uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
+
+		/* Get heap TID for item to the right */
+		rheaptid = BTreeTupleGetHeapTID(ritup);
+
+		if (uppnkeyatts == tupnkeyatts)
+			return scantid == NULL && rheaptid != NULL;
+
+		return tupnkeyatts < uppnkeyatts;
+	}
+
+	return cmp < 0;
+}
+
 /*
  * Does the invariant hold that the key is less than or equal to a given upper
  * bound offset item?
@@ -1759,57 +1835,90 @@ offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
  * to corruption.
  */
 static inline bool
-invariant_leq_offset(BtreeCheckState *state, ScanKey key,
-					 OffsetNumber upperbound)
+invariant_leq_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key,
+					 ItemPointer scantid, OffsetNumber upperbound)
 {
-	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
 	int32		cmp;
 
-	cmp = _bt_compare(state->rel, nkeyatts, key, state->target, upperbound);
+	cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target,
+					  upperbound);
 
 	return cmp <= 0;
 }
 
 /*
- * Does the invariant hold that the key is greater than or equal to a given
- * lower bound offset item?
+ * Does the invariant hold that the key is strictly greater than a given lower
+ * bound offset item?
  *
  * If this function returns false, convention is that caller throws error due
  * to corruption.
  */
 static inline bool
-invariant_geq_offset(BtreeCheckState *state, ScanKey key,
-					 OffsetNumber lowerbound)
+invariant_g_offset(BtreeCheckState *state, int tupnkeyatts, ScanKey key,
+				   ItemPointer scantid, OffsetNumber lowerbound)
 {
-	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
 	int32		cmp;
 
-	cmp = _bt_compare(state->rel, nkeyatts, key, state->target, lowerbound);
+	/*
+	 * No need to consider the possibility that the scan key has truncated
+	 * attributes that must be interpreted as negative infinity, since the
+	 * scan key only has to be strictly greater than the lower bound offset.
+	 */
+	cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, state->target,
+					  lowerbound);
 
-	return cmp >= 0;
+	return cmp > 0;
 }
 
 /*
- * Does the invariant hold that the key is less than or equal to a given upper
+ * Does the invariant hold that the key is strictly less than a given upper
  * bound offset item, with the offset relating to a caller-supplied page that
- * is not the current target page? Caller's non-target page is typically a
- * child page of the target, checked as part of checking a property of the
- * target page (i.e. the key comes from the target).
+ * is not the current target page?
+ *
+ * Caller's non-target page is a child page of the target, checked as part of
+ * checking a property of the target page (i.e.  the key comes from the
+ * target).
  *
  * If this function returns false, convention is that caller throws error due
  * to corruption.
  */
 static inline bool
-invariant_leq_nontarget_offset(BtreeCheckState *state,
-							   Page nontarget, ScanKey key,
-							   OffsetNumber upperbound)
+invariant_l_nontarget_offset(BtreeCheckState *state, Page nontarget,
+							 int tupnkeyatts, ScanKey key, ItemPointer scantid,
+							 OffsetNumber upperbound)
 {
-	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
 	int32		cmp;
 
-	cmp = _bt_compare(state->rel, nkeyatts, key, nontarget, upperbound);
+	cmp = _bt_compare(state->rel, tupnkeyatts, key, scantid, nontarget,
+					  upperbound);
 
-	return cmp <= 0;
+	/*
+	 * _bt_compare() interprets the absence of attributes in scan keys as
+	 * meaning that they're not participating in a search, not as negative
+	 * infinity (only truncated attributes in tuples within the index are
+	 * treated as negative infinity).  Compensate for that here.
+	 */
+	if (cmp == 0)
+	{
+		ItemId		itemid;
+		IndexTuple	child;
+		int			uppnkeyatts;
+		ItemPointer childheaptid;
+
+		itemid = PageGetItemId(nontarget, upperbound);
+		child = (IndexTuple) PageGetItem(nontarget, itemid);
+		uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
+
+		/* Get heap TID for item from child/non-target */
+		childheaptid = BTreeTupleGetHeapTID(child);
+
+		if (uppnkeyatts == tupnkeyatts)
+			return scantid == NULL && childheaptid != NULL;
+
+		return tupnkeyatts < uppnkeyatts;
+	}
+
+	return cmp < 0;
 }
 
 /*
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 3680e69b89..0782f0129c 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -46,18 +46,15 @@ the real "key" at all, just at the link field.)  We can distinguish
 items at the leaf level in the same way, by examining their links to
 heap tuples; we'd never have two items for the same heap tuple.
 
-Lehman and Yao assume that the key range for a subtree S is described
+Lehman and Yao require that the key range for a subtree S is described
 by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
-page.  This does not work for nonunique keys (for example, if we have
-enough equal keys to spread across several leaf pages, there *must* be
-some equal bounding keys in the first level up).  Therefore we assume
-Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
-bounding key in an upper tree level must descend to the left of that
-key to ensure it finds any equal keys in the preceding page.  An
-insertion that sees the high key of its target page is equal to the key
-to be inserted has a choice whether or not to move right, since the new
-key could go on either page.  (Currently, we try to find a page where
-there is room for the new key without a split.)
+page.  A search that finds exact equality to a bounding key in an upper
+tree level must descend to the left of that key to ensure it finds any
+equal keys.  An insertion that finds that the high key of its target
+page is equal to the key to be inserted cannot move right, since the
+downlink for the right sibling in the parent must always be strictly
+less than all right sibling keys (this is always possible to arrange
+because the leftmost downlink on any non-leaf level is negative infinity).
 
 Lehman and Yao don't require read locks, but assume that in-memory
 copies of tree pages are unshared.  Postgres shares in-memory buffers
@@ -610,21 +607,25 @@ scanned to decide whether to return the entry and whether the scan can
 stop (see _bt_checkkeys()).
 
 We use term "pivot" index tuples to distinguish tuples which don't point
-to heap tuples, but rather used for tree navigation.  Pivot tuples includes
-all tuples on non-leaf pages and high keys on leaf pages.  Note that pivot
-index tuples are only used to represent which part of the key space belongs
-on each page, and can have attribute values copied from non-pivot tuples
-that were deleted and killed by VACUUM some time ago.  In principle, we could
-truncate away attributes that are not needed for a page high key during a leaf
-page split, provided that the remaining attributes distinguish the last index
-tuple on the post-split left page as belonging on the left page, and the first
-index tuple on the post-split right page as belonging on the right page.  This
-optimization is sometimes called suffix truncation, and may appear in a future
-release. Since the high key is subsequently reused as the downlink in the
-parent page for the new right page, suffix truncation can increase index
-fan-out considerably by keeping pivot tuples short.  INCLUDE indexes similarly
-truncate away non-key attributes at the time of a leaf page split,
-increasing fan-out.
+to heap tuples, but are used only for tree navigation.  Pivot tuples
+include all tuples on non-leaf pages and high keys on leaf pages.  Note
+that pivot index tuples are only used to represent which part of the key
+space belongs on each page, and can have attribute values copied from
+non-pivot tuples that were deleted and killed by VACUUM some time ago.
+
+We truncate away attributes that are not needed for a page high key during
+a leaf page split, provided that the remaining attributes distinguish the
+last index tuple on the post-split left page as belonging on the left
+page, and the first index tuple on the post-split right page as belonging
+on the right page.  A truncated tuple logically retains the truncated
+suffix key attributes, which implicitly have "negative infinity" as their
+value.  This optimization is called suffix truncation.  Since the high key
+is subsequently reused as the downlink in the parent page for the new
+right page, suffix truncation can increase index fan-out considerably by
+keeping pivot tuples short.  INCLUDE indexes are guaranteed to have
+non-key attributes truncated at the time of a leaf page split, but may
+also have some key attributes truncated away, based on the usual criteria
+for key attributes.
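+
+For example, in a hypothetical index on (lastname, firstname), a leaf
+page split might leave ("Jones", "Theodore") as the last tuple on the
+left page, with ("Smith", "Anne") as the first tuple on the right page.
+The new high key only needs to keep "Smith": lastname alone distinguishes
+the two pages, so firstname is truncated away and implicitly becomes
+negative infinity.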
 
 Notes About Data Representation
 -------------------------------
@@ -658,4 +659,19 @@ downlink.  The first data item on each such page has no lower bound
 routines must treat it accordingly.  The actual key stored in the
 item is irrelevant, and need not be stored at all.  This arrangement
 corresponds to the fact that an L&Y non-leaf page has one more pointer
-than key.
+than key.  Suffix truncation's negative infinity attributes behave in
+the same way.
+
+Non-leaf pages only truly need to truncate their first item to zero
+attributes at the leftmost level, since that truly is negative infinity.
+All other negative infinity items are only really negative infinity
+within the subtree that the page is at the root of (or is a leftmost
+page within).  We truncate away all attributes of the first item on
+non-leaf pages just the same, to save a little space.  If we ever
+avoided zero-truncating items on pages where that doesn't accurately
+represent the absolute separation of the keyspace, we'd be left with
+"low key" items on internal pages -- a key value that can be used as a
+lower bound on items on the page, much like the high key is an upper
+bound. (Actually, that would even be true of "true" negative infinity
+items.  One can think of rightmost pages as implicitly containing
+"positive infinity" high keys.)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 907cce0724..4c4f7d8835 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -180,7 +180,7 @@ top:
 				!P_IGNORE(lpageop) &&
 				(PageGetFreeSpace(page) > itemsz) &&
 				PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
-				_bt_compare(rel, indnkeyatts, itup_scankey, page,
+				_bt_compare(rel, indnkeyatts, itup_scankey, &itup->t_tid, page,
 							P_FIRSTDATAKEY(lpageop)) > 0)
 			{
 				/*
@@ -216,9 +216,12 @@ top:
 
 	if (!fastpath)
 	{
+		ItemPointer scantid =
+			(checkUnique != UNIQUE_CHECK_NO ? NULL : &itup->t_tid);
+
 		/* find the first page containing this key */
-		stack = _bt_search(rel, indnkeyatts, itup_scankey, false, &buf, BT_WRITE,
-						   NULL);
+		stack = _bt_search(rel, indnkeyatts, itup_scankey, scantid, false,
+						   &buf, BT_WRITE, NULL);
 
 		/* trade in our read lock for a write lock */
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
@@ -231,8 +234,8 @@ top:
 		 * need to move right in the tree.  See Lehman and Yao for an
 		 * excruciatingly precise description.
 		 */
-		buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, false,
-							true, stack, BT_WRITE, NULL);
+		buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, scantid,
+							false, true, stack, BT_WRITE, NULL);
 	}
 
 	/*
@@ -261,7 +264,8 @@ top:
 		TransactionId xwait;
 		uint32		speculativeToken;
 
-		offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, false);
+		/* Find position while excluding heap TID attribute */
+		offset = _bt_binsrch(rel, buf, indnkeyatts, itup_scankey, NULL, false);
 		xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
 								 checkUnique, &is_unique, &speculativeToken);
 
@@ -285,6 +289,25 @@ top:
 				_bt_freestack(stack);
 			goto top;
 		}
+
+		/*
+		 * Be careful to not get confused about user attribute position and
+		 * insertion position.
+		 *
+		 * XXX: This is ugly as sin, and clearly needs a lot more work.  While
+		 * not having this code does not seem to affect regression tests, we
+		 * almost certainly need to do something here for the case where
+		 * _bt_check_unique() traverses many pages, each filled with logical
+		 * duplicates.
+		 */
+		buf = _bt_moveright(rel, buf, indnkeyatts, itup_scankey, &itup->t_tid,
+							false, true, stack, BT_WRITE, NULL);
+		/*
+		 * Always invalidate hint
+		 *
+		 * FIXME: This is unacceptable.
+		 */
+		offset = InvalidOffsetNumber;
 	}
 
 	if (checkUnique != UNIQUE_CHECK_EXISTING)
@@ -564,11 +587,11 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
 			offset = OffsetNumberNext(offset);
 		else
 		{
-			/* If scankey == hikey we gotta check the next page too */
+			/* If scankey <= hikey we gotta check the next page too */
 			if (P_RIGHTMOST(opaque))
 				break;
-			if (!_bt_isequal(itupdesc, page, P_HIKEY,
-							 indnkeyatts, itup_scankey))
+			/* _bt_isequal()'s special NULL semantics not required here */
+			if (_bt_compare(rel, indnkeyatts, itup_scankey, NULL, page, P_HIKEY) > 0)
 				break;
 			/* Advance to next non-dead page --- there must be one */
 			for (;;)
@@ -700,6 +723,18 @@ _bt_findinsertloc(Relation rel,
 	 * pages).  Currently the probability of moving right is set at 0.99,
 	 * which may seem too high to change the behavior much, but it does an
 	 * excellent job of preventing O(N^2) behavior with many equal keys.
+	 *
+	 * TODO: Support this old approach for pre-pg_upgrade indexes.
+	 *
+	 * None of this applies when all items in the tree are unique, since the
+	 * new item cannot go on either page if it's equal to the high key.  The
+	 * original L&Y invariant that we now follow is that high keys must be
+	 * less than or equal to all items on the page, and strictly less than
+	 * the right sibling items (since the high key also becomes the downlink
+	 * to the right sibling in parent after a page split).  It's very
+	 * unlikely that it will be equal anyway, since there will be explicit
+	 * heap TIDs in pivot tuples in the event of many duplicates, but it can
+	 * happen when heap TID recycling takes place.
 	 *----------
 	 */
 	movedright = false;
@@ -731,8 +766,7 @@ _bt_findinsertloc(Relation rel,
 		 * nope, so check conditions (b) and (c) enumerated above
 		 */
 		if (P_RIGHTMOST(lpageop) ||
-			_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
-			random() <= (MAX_RANDOM_VALUE / 100))
+			_bt_compare(rel, keysz, scankey, &newtup->t_tid, page, P_HIKEY) <= 0)
 			break;
 
 		/*
@@ -792,7 +826,7 @@ _bt_findinsertloc(Relation rel,
 	else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
 		newitemoff = firstlegaloff;
 	else
-		newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+		newitemoff = _bt_binsrch(rel, buf, keysz, scankey, &newtup->t_tid, false);
 
 	*bufptr = buf;
 	*offsetptr = newitemoff;
@@ -851,11 +885,12 @@ _bt_insertonpg(Relation rel,
 	/* child buffer must be given iff inserting on an internal page */
 	Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
 	/* tuple must have appropriate number of attributes */
+	Assert(BTreeTupleGetNAtts(itup, rel) > 0);
 	Assert(!P_ISLEAF(lpageop) ||
 		   BTreeTupleGetNAtts(itup, rel) ==
 		   IndexRelationGetNumberOfAttributes(rel));
 	Assert(P_ISLEAF(lpageop) ||
-		   BTreeTupleGetNAtts(itup, rel) ==
+		   BTreeTupleGetNAtts(itup, rel) <=
 		   IndexRelationGetNumberOfKeyAttributes(rel));
 
 	/* The caller should've finished any incomplete splits already. */
@@ -1143,8 +1178,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	OffsetNumber i;
 	bool		isleaf;
 	IndexTuple	lefthikey;
-	int			indnatts = IndexRelationGetNumberOfAttributes(rel);
-	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 
 	/* Acquire a new page to split into */
 	rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
@@ -1214,7 +1247,9 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		itemid = PageGetItemId(origpage, P_HIKEY);
 		itemsz = ItemIdGetLength(itemid);
 		item = (IndexTuple) PageGetItem(origpage, itemid);
-		Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
+		Assert(BTreeTupleGetNAtts(item, rel) > 0);
+		Assert(BTreeTupleGetNAtts(item, rel) <=
+			   IndexRelationGetNumberOfKeyAttributes(rel));
 		if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
 						false, false) == InvalidOffsetNumber)
 		{
@@ -1247,25 +1282,35 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	}
 
 	/*
-	 * Truncate non-key (INCLUDE) attributes of the high key item before
-	 * inserting it on the left page.  This only needs to happen at the leaf
-	 * level, since in general all pivot tuple values originate from leaf
-	 * level high keys.  This isn't just about avoiding unnecessary work,
-	 * though; truncating unneeded key attributes (more aggressive suffix
-	 * truncation) can only be performed at the leaf level anyway.  This is
-	 * because a pivot tuple in a grandparent page must guide a search not
-	 * only to the correct parent page, but also to the correct leaf page.
+	 * Truncate attributes of the high key item before inserting it on the left
+	 * page.  This can only happen at the leaf level, since in general all
+	 * pivot tuple values originate from leaf level high keys.  This isn't just
+	 * about avoiding unnecessary work, though; truncating unneeded key suffix
+	 * attributes can only be performed at the leaf level anyway.  This is
+	 * because a pivot tuple in a grandparent page must guide a search not only
+	 * to the correct parent page, but also to the correct leaf page.
+	 *
+	 * Note that non-key (INCLUDE) attributes are always truncated away here.
+	 * Additional key attributes are truncated away when they're not required
+	 * to correctly separate the key space.
+	 *
+	 * TODO: Give a little weight to how large the final downlink will be when
+	 * deciding on a split point.
 	 */
-	if (indnatts != indnkeyatts && isleaf)
+	if (isleaf)
 	{
-		lefthikey = _bt_nonkey_truncate(rel, item);
+		OffsetNumber	lastleftoffnum = OffsetNumberPrev(firstright);
+
+		lefthikey = _bt_suffix_truncate(rel, origpage, lastleftoffnum, item);
 		itemsz = IndexTupleSize(lefthikey);
 		itemsz = MAXALIGN(itemsz);
 	}
 	else
 		lefthikey = item;
 
-	Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
+	Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
+	Assert(BTreeTupleGetNAtts(lefthikey, rel) <=
+		   IndexRelationGetNumberOfKeyAttributes(rel));
 	if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
 					false, false) == InvalidOffsetNumber)
 	{
@@ -1487,22 +1532,11 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		if (newitemonleft)
 			XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
 
-		/* Log left page */
-		if (!isleaf || indnatts != indnkeyatts)
-		{
-			/*
-			 * We must also log the left page's high key.  There are two
-			 * reasons for that: right page's leftmost key is suppressed on
-			 * non-leaf levels and in covering indexes included columns are
-			 * truncated from high keys.  Show it as belonging to the left
-			 * page buffer, so that it is not stored if XLogInsert decides it
-			 * needs a full-page image of the left page.
-			 */
-			itemid = PageGetItemId(origpage, P_HIKEY);
-			item = (IndexTuple) PageGetItem(origpage, itemid);
-			XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
-			loglhikey = true;
-		}
+		/* Log left page.  We now always log the left page's high key. */
+		itemid = PageGetItemId(origpage, P_HIKEY);
+		item = (IndexTuple) PageGetItem(origpage, itemid);
+		XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
+		loglhikey = true;
 
 		/*
 		 * Log the contents of the right page in the format understood by
@@ -2210,7 +2244,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	/*
 	 * insert the right page pointer into the new root page.
 	 */
-	Assert(BTreeTupleGetNAtts(right_item, rel) ==
+	Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+	Assert(BTreeTupleGetNAtts(right_item, rel) <=
 		   IndexRelationGetNumberOfKeyAttributes(rel));
 	if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
 					false, false) == InvalidOffsetNumber)
@@ -2322,8 +2357,8 @@ _bt_pgaddtup(Page page,
 /*
  * _bt_isequal - used in _bt_doinsert in check for duplicates.
  *
- * This is very similar to _bt_compare, except for NULL handling.
- * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
+ * This is very similar to _bt_compare, except for NULL and negative infinity
+ * handling.  Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
  */
 static bool
 _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
@@ -2337,12 +2372,6 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 
-	/*
-	 * It's okay that we might perform a comparison against a truncated page
-	 * high key when caller needs to determine if _bt_check_unique scan must
-	 * continue on to the next page.  Caller never asks us to compare non-key
-	 * attributes within an INCLUDE index.
-	 */
 	for (i = 1; i <= keysz; i++)
 	{
 		AttrNumber	attno;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index a24e64156a..25b24b1d66 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1415,8 +1415,10 @@ _bt_pagedel(Relation rel, Buffer buf)
 				itup_scankey = _bt_mkscankey(rel, targetkey);
 				/* find the leftmost leaf page containing this key */
 				stack = _bt_search(rel,
-								   IndexRelationGetNumberOfKeyAttributes(rel),
-								   itup_scankey, false, &lbuf, BT_READ, NULL);
+								   BTreeTupleGetNAtts(targetkey, rel),
+								   itup_scankey,
+								   BTreeTupleGetHeapTID(targetkey), false,
+								   &lbuf, BT_READ, NULL);
 				/* don't need a pin on the page */
 				_bt_relbuf(rel, lbuf);
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 0bcfa10b86..1e4a82bf77 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -94,8 +94,8 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
  * any incomplete splits encountered during the search will be finished.
  */
 BTStack
-_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
-		   Buffer *bufP, int access, Snapshot snapshot)
+_bt_search(Relation rel, int keysz, ScanKey scankey, ItemPointer scantid,
+		   bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
 {
 	BTStack		stack_in = NULL;
 
@@ -130,7 +130,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		 * if the leaf page is split and we insert to the parent page).  But
 		 * this is a good opportunity to finish splits of internal pages too.
 		 */
-		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, scantid, nextkey,
 							  (access == BT_WRITE), stack_in,
 							  BT_READ, snapshot);
 
@@ -144,7 +144,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		 * Find the appropriate item on the internal page, and get the child
 		 * page that it points to.
 		 */
-		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
+		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, scantid, nextkey);
 		itemid = PageGetItemId(page, offnum);
 		itup = (IndexTuple) PageGetItem(page, itemid);
 		blkno = BTreeInnerTupleGetDownLink(itup);
@@ -215,6 +215,7 @@ _bt_moveright(Relation rel,
 			  Buffer buf,
 			  int keysz,
 			  ScanKey scankey,
+			  ItemPointer scantid,
 			  bool nextkey,
 			  bool forupdate,
 			  BTStack stack,
@@ -275,7 +276,7 @@ _bt_moveright(Relation rel,
 			continue;
 		}
 
-		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
+		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, scantid, page, P_HIKEY) >= cmpval)
 		{
 			/* step right one page */
 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
@@ -324,6 +325,7 @@ _bt_binsrch(Relation rel,
 			Buffer buf,
 			int keysz,
 			ScanKey scankey,
+			ItemPointer scantid,
 			bool nextkey)
 {
 	Page		page;
@@ -371,7 +373,7 @@ _bt_binsrch(Relation rel,
 
 		/* We have low <= mid < high, so mid points at a real slot */
 
-		result = _bt_compare(rel, keysz, scankey, page, mid);
+		result = _bt_compare(rel, keysz, scankey, scantid, page, mid);
 
 		if (result >= cmpval)
 			low = mid + 1;
@@ -428,24 +430,36 @@ int32
 _bt_compare(Relation rel,
 			int keysz,
 			ScanKey scankey,
+			ItemPointer scantid,
 			Page page,
 			OffsetNumber offnum)
 {
 	TupleDesc	itupdesc = RelationGetDescr(rel);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	ItemPointer  heapTid;
 	IndexTuple	itup;
+	int			ntupatts;
+	int			ncmpkey;
 	int			i;
 
+	Assert(keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
 	Assert(_bt_check_natts(rel, page, offnum));
 
 	/*
 	 * Force result ">" if target item is first data item on an internal page
 	 * --- see NOTE above.
+	 *
+	 * A minus infinity key has all attributes truncated away, so this test is
+	 * redundant with the minus infinity attribute tie-breaker.  However, the
+	 * number of attributes in minus infinity tuples was not explicitly
+	 * represented as 0 until PostgreSQL v11, so an explicit offnum test is
+	 * still required.
 	 */
 	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
 		return 1;
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+	ntupatts = BTreeTupleGetNAtts(itup, rel);
 
 	/*
 	 * The scan key is set up with the attribute number associated with each
@@ -459,7 +473,8 @@ _bt_compare(Relation rel,
 	 * _bt_first).
 	 */
 
-	for (i = 1; i <= keysz; i++)
+	ncmpkey = Min(ntupatts, keysz);
+	for (i = 1; i <= ncmpkey; i++)
 	{
 		Datum		datum;
 		bool		isNull;
@@ -510,8 +525,69 @@ _bt_compare(Relation rel,
 		scankey++;
 	}
 
-	/* if we get here, the keys are equal */
-	return 0;
+	/*
+	 * Use the number of attributes as a tie-breaker, in order to treat
+	 * truncated attributes in the index tuple as minus infinity.
+	 */
+	if (keysz > ntupatts)
+		return 1;
+
+	/* If caller provided no heap TID tie-breaker for scan, they're equal */
+	if (!scantid)
+		return 0;
+
+	/*
+	 * Although it isn't counted as an attribute by BTreeTupleGetNAtts(), heap
+	 * TID is an implicit final key attribute that ensures that all index
+	 * tuples have a distinct set of key attribute values.
+	 *
+	 * This is often truncated away in pivot tuples, which makes the attribute
+	 * value implicitly negative infinity.
+	 */
+	heapTid = BTreeTupleGetHeapTID(itup);
+	if (!heapTid)
+		return 1;
+
+	/* Deliberately invert the order, since TIDs "sort DESC" */
+	return ItemPointerCompare(heapTid, scantid);
+}
+
+/*
+ * Return how many attributes to leave when truncating.
+ *
+ * This only considers key attributes, since non-key attributes should always
+ * be truncated away.  We only need attributes up to and including the first
+ * distinguishing attribute.
+ *
+ * This can return a number of attributes that is one greater than the number
+ * of key attributes actually found in the first right tuple.  This indicates
+ * that the caller must use the last left tuple's heap TID as a unique-ifier
+ * in its new high key tuple.
+ */
+int
+_bt_leave_natts(Relation rel, Page leftpage, OffsetNumber lastleftoffnum,
+				IndexTuple firstright)
+{
+	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+	int			leavenatts;
+	ScanKey		skey;
+
+	skey = _bt_mkscankey(rel, firstright);
+
+	/*
+	 * Test even the full nkeyatts (untruncated) case, since the caller cares
+	 * about whether or not it can avoid appending a heap TID as a unique-ifier.
+	 */
+	for (leavenatts = 1; leavenatts <= nkeyatts; leavenatts++)
+	{
+		if (_bt_compare(rel, leavenatts, skey, NULL, leftpage, lastleftoffnum) > 0)
+			break;
+	}
+
+	/* Can't leak memory here */
+	_bt_freeskey(skey);
+
+	return leavenatts;
 }
 
 /*
@@ -1027,7 +1103,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * Use the manufactured insertion scan key to descend the tree and
 	 * position ourselves on the target leaf page.
 	 */
-	stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
+	stack = _bt_search(rel, keysCount, scankeys, NULL, nextkey, &buf, BT_READ,
 					   scan->xs_snapshot);
 
 	/* don't need to keep the stack around... */
@@ -1057,7 +1133,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	_bt_initialize_more_data(so, dir);
 
 	/* position to the precise item on the page */
-	offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
+	offnum = _bt_binsrch(rel, buf, keysCount, scankeys, NULL, nextkey);
 
 	/*
 	 * If nextkey = false, we are positioned at the first item >= scan key, or
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 16f5755777..6579021a04 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -796,8 +796,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	OffsetNumber last_off;
 	Size		pgspc;
 	Size		itupsz;
-	int			indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
-	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
 
 	/*
 	 * This is a handy place to check for cancel interrupts during the btree
@@ -880,17 +878,17 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		ItemIdSetUnused(ii);	/* redundant */
 		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
 
-		if (indnkeyatts != indnatts && P_ISLEAF(opageop))
+		if (P_ISLEAF(opageop))
 		{
-			IndexTuple	truncated;
-			Size		truncsz;
+			OffsetNumber	lastleftoffnum = OffsetNumberPrev(last_off);
+			IndexTuple		truncated;
+			Size			truncsz;
 
 			/*
-			 * Truncate any non-key attributes from high key on leaf level
-			 * (i.e. truncate on leaf level if we're building an INCLUDE
-			 * index).  This is only done at the leaf level because downlinks
-			 * in internal pages are either negative infinity items, or get
-			 * their contents from copying from one level down.  See also:
+			 * Truncate away any unneeded attributes from high key on leaf
+			 * level.  This is only done at the leaf level because downlinks in
+			 * internal pages are either negative infinity items, or get their
+			 * contents from copying from one level down.  See also:
 			 * _bt_split().
 			 *
 			 * Since the truncated tuple is probably smaller than the
@@ -904,8 +902,12 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			 * only shift the line pointer array back and forth, and overwrite
 			 * the latter portion of the space occupied by the original tuple.
 			 * This is fairly cheap.
+			 *
+			 * TODO: Give a little weight to how large the final downlink will
+			 * be when deciding on a split point.
 			 */
-			truncated = _bt_nonkey_truncate(wstate->index, oitup);
+			truncated = _bt_suffix_truncate(wstate->index, opage,
+											lastleftoffnum, oitup);
 			truncsz = IndexTupleSize(truncated);
 			PageIndexTupleDelete(opage, P_HIKEY);
 			_bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
@@ -924,8 +926,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		if (state->btps_next == NULL)
 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
 
-		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
-			   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
+		Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
+				IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+				BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
 			   P_LEFTMOST(opageop));
 		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
 			   !P_LEFTMOST(opageop));
@@ -970,7 +973,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	 * the first item for a page is copied from the prior page in the code
 	 * above.  Since the minimum key for an entire level is only used as a
 	 * minus infinity downlink, and never as a high key, there is no need to
-	 * truncate away non-key attributes at this point.
+	 * truncate away suffix attributes at this point.
 	 */
 	if (last_off == P_HIKEY)
 	{
@@ -1029,8 +1032,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		}
 		else
 		{
-			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
-				   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
+			Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <=
+					IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
+					BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) ||
 				   P_LEFTMOST(opaque));
 			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
 				   !P_LEFTMOST(opaque));
@@ -1127,6 +1131,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 			}
 			else if (itup != NULL)
 			{
+				int32		compare = 0;
+
 				for (i = 1; i <= keysz; i++)
 				{
 					SortSupport entry;
@@ -1134,7 +1140,6 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 								attrDatum2;
 					bool		isNull1,
 								isNull2;
-					int32		compare;
 
 					entry = sortKeys + i - 1;
 					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
@@ -1151,6 +1156,22 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 					else if (compare < 0)
 						break;
 				}
+
+				/*
+				 * If key values are equal, we sort on ItemPointer.  This is
+				 * required for btree indexes, since heap TID is treated as an
+				 * implicit last key attribute in order to ensure that all keys
+				 * in the index are physically unique.
+				 *
+				 * Deliberately invert the order, since TIDs "sort DESC".
+				 */
+				if (compare == 0)
+				{
+					compare = ItemPointerCompare(&itup2->t_tid, &itup->t_tid);
+					Assert(compare != 0);
+					if (compare > 0)
+						load1 = false;
+				}
 			}
 			else
 				load1 = false;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index acb944357a..b9f9883bdd 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -56,27 +56,34 @@ static bool _bt_check_rowcompare(ScanKey skey,
  *		Build an insertion scan key that contains comparison data from itup
  *		as well as comparator routines appropriate to the key datatypes.
  *
- *		The result is intended for use with _bt_compare().
+ *		The result is intended for use with _bt_compare().  If itup has
+ *		undergone suffix truncation of key attributes, caller had better
+ *		pass BTreeTupleGetNAtts(itup, rel) as keysz to routines like
+ *		_bt_search() and _bt_compare() when using returned scan key.  This
+ *		allows truncated attributes to participate in comparisons (truncated
+ *		attributes have implicit negative infinity values).  Note that
+ *		_bt_compare() never treats a scan key as containing negative
+ *		infinity attributes.
  */
 ScanKey
 _bt_mkscankey(Relation rel, IndexTuple itup)
 {
 	ScanKey		skey;
 	TupleDesc	itupdesc;
+	int			tupnatts;
 	int			indnatts PG_USED_FOR_ASSERTS_ONLY;
 	int			indnkeyatts;
 	int16	   *indoption;
 	int			i;
 
 	itupdesc = RelationGetDescr(rel);
+	tupnatts = BTreeTupleGetNAtts(itup, rel);
 	indnatts = IndexRelationGetNumberOfAttributes(rel);
 	indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 	indoption = rel->rd_indoption;
 
-	Assert(indnkeyatts > 0);
-	Assert(indnkeyatts <= indnatts);
-	Assert(BTreeTupleGetNAtts(itup, rel) == indnatts ||
-		   BTreeTupleGetNAtts(itup, rel) == indnkeyatts);
+	Assert(tupnatts > 0);
+	Assert(tupnatts <= indnatts);
 
 	/*
 	 * We'll execute search using scan key constructed on key columns. Non-key
@@ -96,7 +103,21 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 		 * comparison can be needed.
 		 */
 		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
-		arg = index_getattr(itup, i + 1, itupdesc, &null);
+
+		/*
+		 * Key attributes may not be represented in the index tuple due to
+		 * suffix truncation.  Keys built from truncated attributes are
+		 * defensively represented as NULL values, though they must never be
+		 * allowed to participate in comparisons (caller must be sure to pass
+		 * a sane keysz to _bt_compare()).
+		 */
+		if (i < tupnatts)
+			arg = index_getattr(itup, i + 1, itupdesc, &null);
+		else
+		{
+			arg = (Datum) 0;
+			null = true;
+		}
 		flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
 		ScanKeyEntryInitializeWithInfo(&skey[i],
 									   flags,
@@ -2083,38 +2104,116 @@ btproperty(Oid index_oid, int attno,
 }
 
 /*
- *	_bt_nonkey_truncate() -- create tuple without non-key suffix attributes.
+ *	_bt_suffix_truncate() -- create tuple without unneeded suffix attributes.
  *
  * Returns truncated index tuple allocated in caller's memory context, with key
- * attributes copied from caller's itup argument.  Currently, suffix truncation
- * is only performed to create pivot tuples in INCLUDE indexes, but some day it
- * could be generalized to remove suffix attributes after the first
- * distinguishing key attribute.
+ * attributes copied from caller's itup argument.  If rel is an INCLUDE index,
+ * non-key attributes are always truncated away, since they're not part of the
+ * key space, and are not used in pivot tuples.  More aggressive suffix
+ * truncation can take place when it's clear that the returned tuple does not
+ * need one or more suffix key attributes.  This is possible when there are
+ * attributes after an already distinct pair of attributes.
  *
- * Truncated tuple is guaranteed to be no larger than the original, which is
- * important for staying under the 1/3 of a page restriction on tuple size.
+ * Truncated tuple is guaranteed to be no larger than the original plus space
+ * for an extra heap TID tie-breaker attribute, which is important for staying
+ * under the 1/3 of a page restriction on tuple size.
  *
  * Note that returned tuple's t_tid offset will hold the number of attributes
  * present, so the original item pointer offset is not represented.  Caller
- * should only change truncated tuple's downlink.
+ * should only change truncated tuple's downlink.  Note also that truncated key
+ * attributes are treated as containing "minus infinity" values by
+ * _bt_compare().
  */
 IndexTuple
-_bt_nonkey_truncate(Relation rel, IndexTuple itup)
+_bt_suffix_truncate(Relation rel, Page leftpage, OffsetNumber lastleftoffnum,
+					IndexTuple firstright)
 {
-	int			nkeyattrs = IndexRelationGetNumberOfKeyAttributes(rel);
-	IndexTuple	truncated;
+	TupleDesc		itupdesc = RelationGetDescr(rel);
+	int16			natts = IndexRelationGetNumberOfAttributes(rel);
+	int16			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+	int				leavenatts;
+	IndexTuple		pivot;
+	ItemId			lastleftitem;
+	IndexTuple		lastlefttuple;
+	Size			newsize;
+
 
 	/*
-	 * We should only ever truncate leaf index tuples, which must have both
-	 * key and non-key attributes.  It's never okay to truncate a second time.
+	 * We should only ever truncate leaf index tuples, which must have non-key
+	 * attributes in the case of INCLUDE indexes.  It's never okay to truncate
+	 * a second time.
 	 */
-	Assert(BTreeTupleGetNAtts(itup, rel) ==
-		   IndexRelationGetNumberOfAttributes(rel));
+	Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
+
+	/* Determine how many attributes must be left behind */
+	leavenatts = _bt_leave_natts(rel, leftpage, lastleftoffnum, firstright);
+
+	if (leavenatts <= natts)
+	{
+		IndexTuple		tidpivot;
+
+		/*
+		 * Truncate away non-key attributes and/or key attributes.  Do a
+		 * straight copy in the case where the only attribute to be "truncated
+		 * away" is the implicit heap TID key attribute (i.e. the case where we
+		 * can at least avoid adding an explicit heap TID attribute to new
+		 * pivot).
+		 */
+		if (leavenatts < natts)
+			pivot = index_truncate_tuple(itupdesc, firstright, leavenatts);
+		else
+			pivot = CopyIndexTuple(firstright);
 
-	truncated = index_truncate_tuple(RelationGetDescr(rel), itup, nkeyattrs);
-	BTreeTupleSetNAtts(truncated, nkeyattrs);
+		/*
+		 * If there is a distinguishing key attribute within leavenatts, there
+		 * is no need to add an explicit heap TID attribute to new pivot.
+		 */
+		if (leavenatts <= nkeyatts)
+		{
+			BTreeTupleSetNAtts(pivot, leavenatts);
+			return pivot;
+		}
 
-	return truncated;
+		/*
+		 * Only non-key attributes could be truncated away.  They are not
+		 * considered part of the key space, so it's still necessary to add a
+		 * heap TID attribute to the new pivot tuple.  Create an enlarged copy
+		 * of the truncated right tuple, to fit the heap TID.
+		 */
+		newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
+		tidpivot = palloc0(newsize);
+		memcpy(tidpivot, pivot, IndexTupleSize(pivot));
+		pfree(pivot);
+		pivot = tidpivot;
+	}
+	else
+	{
+		/*
+		 * No truncation was possible.  Create an enlarged copy of the first
+		 * right tuple, to fit the heap TID.
+		 */
+		newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
+		pivot = palloc0(newsize);
+		memcpy(pivot, firstright, IndexTupleSize(firstright));
+	}
+
+	/*
+	 * We must use heap TID as a unique-ifier in new pivot tuple, since no user
+	 * key attributes could be truncated away.  The heap TID must come from
+	 * the last tuple on the left page, since the new downlink must be a
+	 * strict lower bound on the new right page.
+	 */
+	pivot->t_info &= ~INDEX_SIZE_MASK;
+	pivot->t_info |= newsize;
+	/* Copy last left item's heap TID into new pivot tuple */
+	lastleftitem = PageGetItemId(leftpage, lastleftoffnum);
+	lastlefttuple = (IndexTuple) PageGetItem(leftpage, lastleftitem);
+	memcpy((char *) pivot + newsize - MAXALIGN(sizeof(ItemPointerData)),
+		   &lastlefttuple->t_tid, sizeof(ItemPointerData));
+	/* Tuple has all key attributes */
+	BTreeTupleSetNAtts(pivot, nkeyatts);
+	BTreeTupleSetHeapTID(pivot);
+	return pivot;
 }
 
 /*
@@ -2137,6 +2236,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	IndexTuple	itup;
+	int			tupnatts;
 
 	/*
 	 * We cannot reliably test a deleted or half-deleted page, since they have
@@ -2156,6 +2256,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 					 "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
 
 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+	tupnatts = BTreeTupleGetNAtts(itup, rel);
 
 	if (P_ISLEAF(opaque))
 	{
@@ -2165,7 +2266,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			 * Leaf tuples that are not the page high key (non-pivot tuples)
 			 * should never be truncated
 			 */
-			return BTreeTupleGetNAtts(itup, rel) == natts;
+			return tupnatts == natts;
 		}
 		else
 		{
@@ -2176,7 +2277,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			Assert(!P_RIGHTMOST(opaque));
 
 			/* Page high key tuple contains only key attributes */
-			return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+			return tupnatts > 0 && tupnatts <= nkeyatts;
 		}
 	}
 	else						/* !P_ISLEAF(opaque) */
@@ -2209,7 +2310,7 @@ _bt_check_natts(Relation rel, Page page, OffsetNumber offnum)
 			 * Tuple contains only key attributes, whether or not it is the
 			 * page high key
 			 */
-			return BTreeTupleGetNAtts(itup, rel) == nkeyatts;
+			return tupnatts > 0 && tupnatts <= nkeyatts;
 		}
 
 	}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 67a94cb80a..f1d286c1ba 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -252,6 +252,9 @@ btree_xlog_split(bool onleft, bool lhighkey, XLogReaderState *record)
 	 * When the high key isn't present in the WAL record, we assume it to
 	 * be equal to the first key on the right page.  It must be from the leaf
 	 * level.
+	 *
+	 * FIXME: We currently always log the high key.  Is it worth trying to
+	 * salvage the case where logging isn't strictly necessary?
 	 */
 	if (!lhighkey)
 	{
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 9fb33b9035..2a0b64ad47 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -4057,23 +4057,26 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	}
 
 	/*
-	 * If key values are equal, we sort on ItemPointer.  This does not affect
-	 * validity of the finished index, but it may be useful to have index
-	 * scans in physical order.
+	 * If key values are equal, we sort on ItemPointer.  This is required
+	 * for btree indexes, since heap TID is treated as an implicit last
+	 * key attribute in order to ensure that all keys in the index are
+	 * physically unique.
+	 *
+	 * Deliberately invert the order, since TIDs "sort DESC".
 	 */
 	{
 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
 
 		if (blk1 != blk2)
-			return (blk1 < blk2) ? -1 : 1;
+			return (blk1 < blk2) ? 1 : -1;
 	}
 	{
 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
 
 		if (pos1 != pos2)
-			return (pos1 < pos2) ? -1 : 1;
+			return (pos1 < pos2) ? 1 : -1;
 	}
 
 	return 0;
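
A standalone restatement of the inverted comparison (a hypothetical
function, not part of the patch) makes the DESC ordering explicit.
For example, among otherwise-equal index tuples, TID (42,3) now sorts
before (42,1):

    static int
    compare_heap_tids_desc(ItemPointer a, ItemPointer b)
    {
        BlockNumber ablk = ItemPointerGetBlockNumber(a);
        BlockNumber bblk = ItemPointerGetBlockNumber(b);
        OffsetNumber aoff = ItemPointerGetOffsetNumber(a);
        OffsetNumber boff = ItemPointerGetOffsetNumber(b);

        if (ablk != bblk)
            return (ablk < bblk) ? 1 : -1;    /* inverted */
        if (aoff != boff)
            return (aoff < boff) ? 1 : -1;    /* inverted */
        return 0;
    }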
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 04ecb4cbc0..f6208132b3 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -122,11 +122,21 @@ typedef struct BTMetaPageData
  *
  * We actually need to be able to fit three items on every page,
  * so restrict any one item to 1/3 the per-page available space.
+ *
+ * There are rare cases where _bt_suffix_truncate() will need to enlarge
+ * a heap index tuple to make space for a tie-breaker heap TID
+ * attribute, which we account for here.
  */
-#define BTMaxItemSize(page) \
+#define BTMaxItemSizeOld(page) \
 	MAXALIGN_DOWN((PageGetPageSize(page) - \
 				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+#define BTMaxItemSize(page) \
+	MAXALIGN_DOWN((PageGetPageSize(page) - \
+				   MAXALIGN(SizeOfPageHeaderData + \
+							3*sizeof(ItemIdData)  + \
+							3*MAXALIGN(sizeof(ItemPointerData))) - \
+				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 
 /*
  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
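
Concretely, assuming the default 8KB BLCKSZ and 8-byte MAXALIGN
(figures that are build-dependent and not stated by the patch itself),
the reservation costs 8 bytes of maximum item size:

    old: MAXALIGN_DOWN((8192 - MAXALIGN(24 + 3*4) - 16) / 3)
       = MAXALIGN_DOWN((8192 - 40 - 16) / 3) = 2712 bytes
    new: MAXALIGN_DOWN((8192 - MAXALIGN(24 + 3*4 + 3*8) - 16) / 3)
       = MAXALIGN_DOWN((8192 - 64 - 16) / 3) = 2704 bytes

Here 24 is SizeOfPageHeaderData, 4 is sizeof(ItemIdData), 8 is
MAXALIGN(sizeof(ItemPointerData)), and 16 is
MAXALIGN(sizeof(BTPageOpaqueData)).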
@@ -204,12 +214,10 @@ typedef struct BTMetaPageData
  * real offset (downlinks only need to store a block number).  The offset
  * field only stores the number of attributes when the INDEX_ALT_TID_MASK
  * bit is set (we never assume that pivot tuples must explicitly store the
- * number of attributes, and currently do not bother storing the number of
- * attributes unless indnkeyatts actually differs from indnatts).
- * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's
- * possible that it will be used within non-pivot tuples in the future.  Do
- * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot
- * tuple.
+ * number of attributes).  INDEX_ALT_TID_MASK is only used for pivot tuples
+ * at present, though it's possible that it will be used within non-pivot
+ * tuples in the future.  Do not assume that a tuple with INDEX_ALT_TID_MASK
+ * set must be a pivot tuple.
  *
  * The 12 least significant offset bits are used to represent the number of
  * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved
@@ -219,6 +227,8 @@ typedef struct BTMetaPageData
 #define INDEX_ALT_TID_MASK			INDEX_AM_RESERVED_BIT
 #define BT_RESERVED_OFFSET_MASK		0xF000
 #define BT_N_KEYS_OFFSET_MASK		0x0FFF
+/* Reserved bit that indicates whether a heap TID is stored in a pivot tuple */
+#define BT_HEAP_TID_ATTR			0x1000
 
 /* Get/set downlink block number */
 #define BTreeInnerTupleGetDownLink(itup) \
@@ -241,14 +251,15 @@ typedef struct BTMetaPageData
 	} while(0)
 
 /*
- * Get/set number of attributes within B-tree index tuple. Asserts should be
- * removed when BT_RESERVED_OFFSET_MASK bits will be used.
+ * Get/set number of attributes within B-tree index tuple.
+ *
+ * Note that this does not include an implicit tie-breaker heap-TID
+ * attribute, if any.
  */
 #define BTreeTupleGetNAtts(itup, rel)	\
 	( \
 		(itup)->t_info & INDEX_ALT_TID_MASK ? \
 		( \
-			AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
 			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
 		) \
 		: \
@@ -257,10 +268,32 @@ typedef struct BTMetaPageData
 #define BTreeTupleSetNAtts(itup, n) \
 	do { \
 		(itup)->t_info |= INDEX_ALT_TID_MASK; \
-		Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \
 		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
 	} while(0)
 
+/*
+ * Get/set implicit tie-breaker heap-TID attribute, if any.
+ *
+ * Assumes that any tuple without INDEX_ALT_TID_MASK set has a t_tid that
+ * points into the heap.
+ */
+#define BTreeTupleGetHeapTID(itup) \
+	( \
+	  (itup)->t_info & INDEX_ALT_TID_MASK && \
+	  (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
+	  ( \
+		(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
+					   MAXALIGN(sizeof(ItemPointerData))) \
+	  ) \
+	  : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &(itup)->t_tid \
+	)
+#define BTreeTupleSetHeapTID(itup) \
+	do { \
+		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
+	} while(0)
+
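
A short consumption sketch (not part of the patch; itup stands for an
arbitrary tuple read from some page) shows the cases a caller of
BTreeTupleGetHeapTID() must handle:

    ItemPointer htid = BTreeTupleGetHeapTID(itup);

    if (htid != NULL)
    {
        /* non-pivot tuple, or a pivot that kept a tie-breaker TID */
    }
    else
    {
        /* pivot tuple whose heap TID was truncated away */
    }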
 /*
  *	Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  *	because many places need to use them in ScanKeyInit() calls.
@@ -560,15 +593,17 @@ extern int	_bt_pagedel(Relation rel, Buffer buf);
  * prototypes for functions in nbtsearch.c
  */
 extern BTStack _bt_search(Relation rel,
-		   int keysz, ScanKey scankey, bool nextkey,
+		   int keysz, ScanKey scankey, ItemPointer scantid, bool nextkey,
 		   Buffer *bufP, int access, Snapshot snapshot);
 extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
-			  ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
-			  int access, Snapshot snapshot);
+			  ScanKey scankey, ItemPointer scantid, bool nextkey,
+			  bool forupdate, BTStack stack, int access, Snapshot snapshot);
 extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
-			ScanKey scankey, bool nextkey);
+			ScanKey scankey, ItemPointer scantid, bool nextkey);
 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
-			Page page, OffsetNumber offnum);
+			ItemPointer scantid, Page page, OffsetNumber offnum);
+extern int _bt_leave_natts(Relation rel, Page leftpage,
+						   OffsetNumber lastleftoffnum, IndexTuple firstright);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
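
As a usage sketch (a hypothetical call site; variable names are
illustrative only), an insertion would pass the incoming tuple's own
heap TID as scantid, so that the descent converges on the single leaf
position the tuple may legally occupy, while ordinary index scans pass
NULL and get the old behavior:

    /* descend to the unique leaf position for this index tuple */
    stack = _bt_search(rel, natts, itup_scankey, &itup->t_tid,
                       false, &buf, BT_WRITE, NULL);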
@@ -601,7 +636,9 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
-extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
+extern IndexTuple _bt_suffix_truncate(Relation rel, Page leftpage,
+									  OffsetNumber lastleftoffnum,
+									  IndexTuple firstright);
 extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
 
 /*
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index dc6262be43..2c20cea4b9 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -5896,8 +5896,8 @@ inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2
 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1;
  id1 | id2 | id1 | id2 
 -----+-----+-----+-----
-   1 |   1 |   1 |   1
    1 |   2 |   1 |   2
+   1 |   1 |   1 |   1
 (2 rows)
 
 reset enable_nestloop;
-- 
2.14.1

