From 0d7e3f8c172c9ca7468a50063c1ec66d19d71a44 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 17 Apr 2019 20:17:19 -0700
Subject: [PATCH v2] Prevent O(N^2) unique index insertion edge case.

Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (leaf
page and offset number) for every newly inserted index tuple, no matter
what.  Insertions into non-unique indexes can proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes must be more careful.  The TID value in
scantid must initially be omitted to reliably visit every leaf page that
duplicates could be on, which is needed by unique checking.  The scantid
is filled in after unique checking finishes successfully, which
occasionally forces _bt_findinsertloc() to step right to find the leaf
page that the new tuple must be inserted on.

Unique checking typically has very little overhead, since in practice it
is hardly ever necessary to actually step to the right within
_bt_findinsertloc().  There seems to be no realistic way that an
accretion of _bt_check_unique()-wise duplicates of a single value (or
set of values) can slow down insertions of the same value over time.
Even if there are duplicates of a single value that span multiple leaf
pages, it's inherently necessary to visit those leaf pages anyway, to
check visibility information from the heap.  Visiting the same leaf
pages a second time (when we get to _bt_findinsertloc()) is not
expensive in comparison.

However, there is an important distinction that commit dd299df8
overlooked: the distinction between duplicates that have no NULL values
(tuples that can be involved in unique violation errors), and tuples
with NULL values.  Huge numbers of duplicates with NULL values are
common with some workloads, which caused problematic O(N^2) behavior
within _bt_findinsertloc()'s heapkeyspace path for later insertions of
tuples of the same value.  In extreme cases, _bt_findinsertloc() could
end up grovelling through every leaf page in the index.
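
For example, a workload along these lines (an illustrative sketch with
hypothetical table and index names, not a test case taken from the
discussion thread) would have hit the quadratic behavior:

    CREATE TABLE tab (key int4);
    CREATE UNIQUE INDEX tab_key_uniq ON tab (key);
    -- Before the fix, each new NULL first descends to the start of the
    -- existing range of NULLs, then steps right page by page towards
    -- its final heap TID position, so total insertion work grows
    -- quadratically with the number of NULLs already in the index.
    INSERT INTO tab SELECT NULL FROM generate_series(1, 1000000);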

To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index:

* When the new tuple contains a NULL key value, never set scantid to
  NULL before calling _bt_search() to descend the tree.

* Never perform unique checking for such tuples.

This allows us to remove code that was used by _bt_check_unique() to
handle NULL values.

Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
---
 src/backend/access/nbtree/nbtinsert.c | 111 +++++++++-----------------
 src/backend/access/nbtree/nbtsearch.c |   1 +
 src/backend/access/nbtree/nbtutils.c  |   3 +
 src/include/access/nbtree.h           |   8 ++
 4 files changed, 50 insertions(+), 73 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 4fda0efe9a..c80e6cf6ce 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -55,8 +55,6 @@ static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 				  BTStack stack, bool is_root, bool is_only);
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 			 OffsetNumber itup_off);
-static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key,
-			Page page, OffsetNumber offnum);
 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
 
 /*
@@ -91,9 +89,30 @@ _bt_doinsert(Relation rel, IndexTuple itup,
 
 	/* we need an insertion scan key to do our search, so build one */
 	itup_key = _bt_mkscankey(rel, itup);
-	/* No scantid until uniqueness established in checkingunique case */
-	if (checkingunique && itup_key->heapkeyspace)
-		itup_key->scantid = NULL;
+
+	if (checkingunique)
+	{
+		/* No scantid (for heapkeyspace case) until uniqueness established */
+		if (!itup_key->anynullkeys)
+			itup_key->scantid = NULL;
+		else
+		{
+			/*
+			 * Bypass extra checkingunique steps when new tuple contains NULL
+			 * key value, and assume that it is "unique" for core code's
+			 * purposes.
+			 *
+			 * This optimization helps insertions into a unique index with
+			 * many duplicate tuples with NULLs.  Descending directly to the
+			 * page that we'll ultimately insert on (by keeping scantid set)
+			 * prevents O(N^2) behavior within the _bt_findinsertloc()
+			 * heapkeyspace path.
+			 */
+			Assert(checkUnique != UNIQUE_CHECK_EXISTING);
+			is_unique = true;
+			checkingunique = false;
+		}
+	}
 
 	/*
 	 * Fill in the BTInsertState working area, to track the current page and
@@ -203,8 +222,8 @@ top:
 	buf = InvalidBuffer;		/* insertstate.buf now owns the buffer */
 
 	/*
-	 * If we're not allowing duplicates, make sure the key isn't already in
-	 * the index.
+	 * If we're not allowing duplicates, and scan key has no NULLs, make sure
+	 * the key isn't already in the index.
 	 *
 	 * NOTE: obviously, _bt_check_unique can only detect keys that are already
 	 * in the index; so it cannot defend against concurrent insertions of the
@@ -300,6 +319,10 @@ top:
 /*
  *	_bt_check_unique() -- Check for violation of unique index constraint
  *
+ * Do not call here for insertions with NULL values in scan key.  NULL values
+ * are not considered equal to each other when checking duplicates, so there
+ * cannot be a unique violation.
+ *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
  * we must wait for to see if it commits a conflicting tuple.   If an actual
  * conflict is detected, no return --- just ereport().  If an xact ID is
@@ -321,7 +344,6 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 IndexUniqueCheck checkUnique, bool *is_unique,
 				 uint32 *speculativeToken)
 {
-	TupleDesc	itupdesc = RelationGetDescr(rel);
 	IndexTuple	itup = insertstate->itup;
 	BTScanInsert itup_key = insertstate->itup_key;
 	SnapshotData SnapshotDirty;
@@ -354,6 +376,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 	 * Scan over all equal tuples, looking for live conflicts.
 	 */
 	Assert(!insertstate->bounds_valid || insertstate->low == offset);
+	Assert(!itup_key->anynullkeys);
 	Assert(itup_key->scantid == NULL);
 	for (;;)
 	{
@@ -375,16 +398,16 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			 * original page, which may indicate that we need to examine a
 			 * second or subsequent page.
 			 *
-			 * Note that this optimization avoids calling _bt_isequal()
-			 * entirely when there are no duplicates, as long as the offset
-			 * where the key will go is not at the end of the page.
+			 * Note that this optimization allows us to avoid calling
+			 * _bt_compare() directly when there are no duplicates, as long as
+			 * the offset where the key will go is not at the end of the page.
 			 */
 			if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
 			{
 				Assert(insertstate->bounds_valid);
 				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
 				Assert(insertstate->low <= insertstate->stricthigh);
-				Assert(!_bt_isequal(itupdesc, itup_key, page, offset));
+				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
 				break;
 			}
 
@@ -394,9 +417,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			 * We can skip items that are marked killed.
 			 *
 			 * In the presence of heavy update activity an index may contain
-			 * many killed items with the same key; running _bt_isequal() on
+			 * many killed items with the same key; running _bt_compare() on
 			 * each killed item gets expensive.  Just advance over killed
-			 * items as quickly as we can.  We only apply _bt_isequal() when
+			 * items as quickly as we can.  We only apply _bt_compare() when
 			 * we get to a non-killed item.  Even those comparisons could be
 			 * avoided (in the common case where there is only one page to
 			 * visit) by reusing bounds, but just skipping dead items is fast
@@ -407,13 +430,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				ItemPointerData htid;
 				bool		all_dead;
 
-				/*
-				 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
-				 * how we handling NULLs - and so we must not use _bt_compare
-				 * in real comparison, but only for ordering/finding items on
-				 * pages. - vadim 03/24/97
-				 */
-				if (!_bt_isequal(itupdesc, itup_key, page, offset))
+				if (_bt_compare(rel, itup_key, page, offset) != 0)
 					break;		/* we're past all the equal tuples */
 
 				/* okay, we gotta fetch the heap tuple ... */
@@ -2184,58 +2201,6 @@ _bt_pgaddtup(Page page,
 	return true;
 }
 
-/*
- * _bt_isequal - used in _bt_doinsert in check for duplicates.
- *
- * This is very similar to _bt_compare, except for NULL and negative infinity
- * handling.  Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
- */
-static bool
-_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
-			OffsetNumber offnum)
-{
-	IndexTuple	itup;
-	ScanKey		scankey;
-	int			i;
-
-	/* Better be comparing to a non-pivot item */
-	Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
-	Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
-	Assert(itup_key->scantid == NULL);
-
-	scankey = itup_key->scankeys;
-	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-
-	for (i = 1; i <= itup_key->keysz; i++)
-	{
-		AttrNumber	attno;
-		Datum		datum;
-		bool		isNull;
-		int32		result;
-
-		attno = scankey->sk_attno;
-		Assert(attno == i);
-		datum = index_getattr(itup, attno, itupdesc, &isNull);
-
-		/* NULLs are never equal to anything */
-		if (isNull || (scankey->sk_flags & SK_ISNULL))
-			return false;
-
-		result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
-												 scankey->sk_collation,
-												 datum,
-												 scankey->sk_argument));
-
-		if (result != 0)
-			return false;
-
-		scankey++;
-	}
-
-	/* if we get here, the keys are equal */
-	return true;
-}
-
 /*
  * _bt_vacuum_one_page - vacuum just one index page.
  *
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 8a5c6b3f24..5906c41f31 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1235,6 +1235,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 	/* Initialize remaining insertion scan key fields */
 	inskey.heapkeyspace = _bt_heapkeyspace(rel);
+	inskey.anynullkeys = false;		/* unused */
 	inskey.nextkey = nextkey;
 	inskey.pivotsearch = false;
 	inskey.scantid = NULL;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7e409d616f..d91f40160f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -107,6 +107,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 	key = palloc(offsetof(BTScanInsertData, scankeys) +
 				 sizeof(ScanKeyData) * indnkeyatts);
 	key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
+	key->anynullkeys = false;		/* initial assumption */
 	key->nextkey = false;
 	key->pivotsearch = false;
 	key->keysz = Min(indnkeyatts, tupnatts);
@@ -147,6 +148,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 									   rel->rd_indcollation[i],
 									   procinfo,
 									   arg);
+		/* Record if any key attribute is NULL (or truncated) */
+		key->anynullkeys = key->anynullkeys || null;
 	}
 
 	return key;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index fbc8134cfd..d5eee158f8 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -435,6 +435,13 @@ typedef BTStackData *BTStack;
  * indexes whose version is >= version 4.  It's convenient to keep this close
  * by, rather than accessing the metapage repeatedly.
  *
+ * anynullkeys indicates if any of the scan keys have NULL values (or
+ * originated from truncated attributes) when scankey was built from index
+ * tuple.  This is a redundant convenience variable.  It's almost equivalent
+ * to testing original (known untruncated) tuple's header for HasNulls bit,
+ * but not quite, because its value is unaffected by NULL-ness of non-key
+ * attributes in INCLUDE indexes.
+ *
  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
  * locate the first item >= scankey.  When nextkey is true, they will locate
  * the first item > scan key.
@@ -461,6 +468,7 @@ typedef BTStackData *BTStack;
 typedef struct BTScanInsertData
 {
 	bool		heapkeyspace;
+	bool		anynullkeys;
 	bool		nextkey;
 	bool		pivotsearch;
 	ItemPointer scantid;		/* tiebreaker for scankeys */
-- 
2.17.1

