Alvaro Herrera <alvhe...@alvh.no-ip.org> writes:
> On 2021-May-13, Tom Lane wrote:
>> What do people think about back-patching this?  In existing branches,
>> we've defined it to be an opclass bug if it fails to shorten the leaf
>> datum enough.  But not having any defenses against that seems like
>> not a great idea.  OTOH, the 10-cycles-to-show-progress rule could be
>> argued to be an API break.
> I think if the alternative is to throw an error, we can afford to retry
> quite a few more times than 10 in order not to have that called an API
> break.  Say, retry (MAXIMUM_ALIGNOF << 3) times or so (if you want to
> parameterize on maxalign).  It's not like this is going to be a
> performance drag where not needed .. but I think leaving back-branches
> unfixed is not great.

Hm.  Index bloat is not something to totally ignore, though, so I'm
not sure what the best cutoff is.

Anyway, here is a patch set teased apart into committable bites,
and with your other points addressed.

			regards, tom lane
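PS: for the archives, the maxalign-parameterized cutoff suggested above
would look roughly like the sketch below (MAX_NO_PROGRESS_CYCLES is a
name invented here for illustration only; the patches below keep the
literal 10):

	/* hypothetical: scale the allowed no-progress picksplit cycles by
	 * the platform's maximum alignment */
	#define MAX_NO_PROGRESS_CYCLES	(MAXIMUM_ALIGNOF << 3)

	...
	else if (++numNoProgressCycles < MAX_NO_PROGRESS_CYCLES)
		ok = true;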
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 6200699ddd..dd2ade7bb6 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -554,7 +554,7 @@ ProcessClientWriteInterrupt(bool blocked)
 	{
 		/*
 		 * Don't mess with whereToSendOutput if ProcessInterrupts wouldn't
-		 * do anything.
+		 * service ProcDiePending.
 		 */
 		if (InterruptHoldoffCount == 0 && CritSectionCount == 0)
 		{
@@ -3118,6 +3118,12 @@ RecoveryConflictInterrupt(ProcSignalReason reason)
  * If an interrupt condition is pending, and it's safe to service it,
  * then clear the flag and accept the interrupt.  Called only when
  * InterruptPending is true.
+ *
+ * Note: if INTERRUPTS_CAN_BE_PROCESSED() is true, then ProcessInterrupts
+ * is guaranteed to clear the InterruptPending flag before returning.
+ * (This is not the same as guaranteeing that it's still clear when we
+ * return; another interrupt could have arrived.  But we promise that
+ * any pre-existing one will have been serviced.)
  */
 void
 ProcessInterrupts(void)
@@ -3248,7 +3254,11 @@ ProcessInterrupts(void)
 	{
 		/*
 		 * Re-arm InterruptPending so that we process the cancel request as
-		 * soon as we're done reading the message.
+		 * soon as we're done reading the message.  (XXX this is seriously
+		 * ugly: it complicates INTERRUPTS_CAN_BE_PROCESSED(), and it means we
+		 * can't use that macro directly as the initial test in this function,
+		 * meaning that this code also creates opportunities for other bugs to
+		 * appear.)
 		 */
 		InterruptPending = true;
 	}
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 95202d37af..b95bb956b6 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -57,6 +57,15 @@
  * allowing die interrupts: HOLD_CANCEL_INTERRUPTS() and
  * RESUME_CANCEL_INTERRUPTS().
  *
+ * Note that ProcessInterrupts() has also acquired a number of tasks that
+ * do not necessarily cause a query-cancel-or-die response.  Hence, it's
+ * possible that it will just clear InterruptPending and return.
+ *
+ * INTERRUPTS_PENDING_CONDITION() can be checked to see whether an
+ * interrupt needs to be serviced, without trying to do so immediately.
+ * Some callers are also interested in INTERRUPTS_CAN_BE_PROCESSED(),
+ * which tells whether ProcessInterrupts is sure to clear the interrupt.
+ *
  * Special mechanisms are used to let an interrupt be accepted when we are
  * waiting for a lock or when we are waiting for command input (but, of
  * course, only if the interrupt holdoff counter is zero).  See the
@@ -97,24 +106,27 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount;
 /* in tcop/postgres.c */
 extern void ProcessInterrupts(void);
 
+/* Test whether an interrupt is pending */
 #ifndef WIN32
+#define INTERRUPTS_PENDING_CONDITION() \
+	(unlikely(InterruptPending))
+#else
+#define INTERRUPTS_PENDING_CONDITION() \
+	(unlikely(UNBLOCKED_SIGNAL_QUEUE()) ? pgwin32_dispatch_queued_signals() : 0, \
+	 unlikely(InterruptPending))
+#endif
 
+/* Service interrupt if one is pending */
 #define CHECK_FOR_INTERRUPTS() \
 do { \
-	if (unlikely(InterruptPending)) \
-		ProcessInterrupts(); \
-} while(0)
-#else							/* WIN32 */
-
-#define CHECK_FOR_INTERRUPTS() \
-do { \
-	if (unlikely(UNBLOCKED_SIGNAL_QUEUE())) \
-		pgwin32_dispatch_queued_signals(); \
-	if (unlikely(InterruptPending)) \
+	if (INTERRUPTS_PENDING_CONDITION()) \
 		ProcessInterrupts(); \
 } while(0)
-#endif							/* WIN32 */
 
+/* Is ProcessInterrupts() guaranteed to clear InterruptPending? */
+#define INTERRUPTS_CAN_BE_PROCESSED() \
+	(InterruptHoldoffCount == 0 && CritSectionCount == 0 && \
+	 QueryCancelHoldoffCount == 0)
 
 #define HOLD_INTERRUPTS()  (InterruptHoldoffCount++)
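For illustration (not part of the patch): a function that must hold
buffer locks, as spgdoinsert() does in the next patch, would use the new
macros roughly like this:

	for (;;)
	{
		/*
		 * Buffer lock(s) may be held here, so ProcessInterrupts() could not
		 * throw a cancel error anyway.  Just observe that an interrupt is
		 * pending and escape to a point where servicing it is safe.
		 */
		if (INTERRUPTS_PENDING_CONDITION())
		{
			result = false;		/* tell caller to retry from scratch */
			break;
		}

		/* ... do one step of work while holding buffer lock(s) ... */
	}

	/* ... release all buffer locks ... */

	/* Now a pending cancel can be thrown, or a benign interrupt cleared. */
	Assert(INTERRUPTS_CAN_BE_PROCESSED());
	CHECK_FOR_INTERRUPTS();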
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 4d380c99f0..88ae2499dd 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -1905,13 +1905,14 @@ spgSplitNodeAction(Relation index, SpGistState *state,
  * Insert one item into the index.
  *
  * Returns true on success, false if we failed to complete the insertion
- * because of conflict with a concurrent insert.  In the latter case,
- * caller should re-call spgdoinsert() with the same args.
+ * (typically because of conflict with a concurrent insert).  In the latter
+ * case, caller should re-call spgdoinsert() with the same args.
  */
 bool
 spgdoinsert(Relation index, SpGistState *state,
 			ItemPointer heapPtr, Datum *datums, bool *isnulls)
 {
+	bool		result = true;
 	TupleDesc	leafDescriptor = state->leafTupDesc;
 	bool		isnull = isnulls[spgKeyColumn];
 	int			level = 0;
@@ -2012,6 +2013,14 @@ spgdoinsert(Relation index, SpGistState *state,
 	parent.offnum = InvalidOffsetNumber;
 	parent.node = -1;
 
+	/*
+	 * Before entering the loop, try to clear any pending interrupt condition.
+	 * If a query cancel is pending, we might as well accept it now not later;
+	 * while if a non-canceling condition is pending, servicing it here avoids
+	 * having to restart the insertion and redo all the work so far.
+	 */
+	CHECK_FOR_INTERRUPTS();
+
 	for (;;)
 	{
 		bool		isNew = false;
@@ -2019,9 +2028,18 @@ spgdoinsert(Relation index, SpGistState *state,
 		/*
 		 * Bail out if query cancel is pending.  We must have this somewhere
 		 * in the loop since a broken opclass could produce an infinite
-		 * picksplit loop.
+		 * picksplit loop.  However, because we'll be holding buffer lock(s)
+		 * after the first iteration, ProcessInterrupts() wouldn't be able to
+		 * throw a cancel error here.  Hence, if we see that an interrupt is
+		 * pending, break out of the loop and deal with the situation below.
+		 * Set result = false because we must restart the insertion if the
+		 * interrupt isn't a query-cancel-or-die case.
 		 */
-		CHECK_FOR_INTERRUPTS();
+		if (INTERRUPTS_PENDING_CONDITION())
+		{
+			result = false;
+			break;
+		}
 
 		if (current.blkno == InvalidBlockNumber)
 		{
@@ -2140,10 +2158,14 @@ spgdoinsert(Relation index, SpGistState *state,
 			 * spgAddNode and spgSplitTuple cases will loop back to here to
 			 * complete the insertion operation.  Just in case the choose
 			 * function is broken and produces add or split requests
-			 * repeatedly, check for query cancel.
+			 * repeatedly, check for query cancel (see comments above).
 			 */
 	process_inner_tuple:
-			CHECK_FOR_INTERRUPTS();
+			if (INTERRUPTS_PENDING_CONDITION())
+			{
+				result = false;
+				break;
+			}
 
 			innerTuple = (SpGistInnerTuple) PageGetItem(current.page,
 							PageGetItemId(current.page, current.offnum));
@@ -2267,5 +2289,21 @@ spgdoinsert(Relation index, SpGistState *state,
 			UnlockReleaseBuffer(parent.buffer);
 	}
 
-	return true;
+	/*
+	 * We do not support being called while some outer function is holding a
+	 * buffer lock (or any other reason to postpone query cancels).  If that
+	 * were the case, telling the caller to retry would create an infinite
+	 * loop.
+	 */
+	Assert(INTERRUPTS_CAN_BE_PROCESSED());
+
+	/*
+	 * Finally, check for interrupts again.  If there was a query cancel,
+	 * ProcessInterrupts() will be able to throw the error here.  If it was
+	 * some other kind of interrupt that can just be cleared, return false to
+	 * tell our caller to retry.
+	 */
+	CHECK_FOR_INTERRUPTS();
+
+	return result;
 }
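For context (an abridged sketch, not part of the patch): spgdoinsert()'s
caller already retries on a false return, along the lines of

	initSpGistState(&spgstate, index);

	/*
	 * Retry if the insertion could not be completed.  Since spgdoinsert()
	 * services or throws any pending interrupt before returning false,
	 * this loop cannot spin forever on one pending interrupt.
	 */
	while (!spgdoinsert(index, &spgstate, ht_ctid, values, isnull))
		initSpGistState(&spgstate, index);

(memory-context housekeeping elided), so a cleared interrupt costs only
one restart of the descent.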
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 88ae2499dd..520983ede1 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -669,7 +669,8 @@ checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig,
  * will eventually terminate if lack of balance is the issue.  If the tuple
  * is too big, we assume that repeated picksplit operations will eventually
  * make it small enough by repeated prefix-stripping.  A broken opclass could
- * make this an infinite loop, though.
+ * make this an infinite loop, though, so spgdoinsert() checks that the
+ * leaf datums get smaller each time.
  */
 static bool
 doPickSplit(Relation index, SpGistState *state,
@@ -1918,6 +1919,8 @@ spgdoinsert(Relation index, SpGistState *state,
 	int			level = 0;
 	Datum		leafDatums[INDEX_MAX_KEYS];
 	int			leafSize;
+	int			prevLeafSize;
+	int			numNoProgressCycles = 0;
 	SPPageDesc	current,
 				parent;
 	FmgrInfo   *procinfo = NULL;
@@ -1988,9 +1991,10 @@ spgdoinsert(Relation index, SpGistState *state,
 
 	/*
 	 * If it isn't gonna fit, and the opclass can't reduce the datum size by
-	 * suffixing, bail out now rather than getting into an endless loop.
+	 * suffixing, bail out now rather than doing a lot of useless work.
 	 */
-	if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
+	if (leafSize > SPGIST_PAGE_CAPACITY &&
+		(isnull || !state->config.longValuesOK))
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
@@ -2218,6 +2222,7 @@ spgdoinsert(Relation index, SpGistState *state,
 					/* Adjust level as per opclass request */
 					level += out.result.matchNode.levelAdd;
 					/* Replace leafDatum and recompute leafSize */
+					prevLeafSize = leafSize;
 					if (!isnull)
 					{
 						leafDatums[spgKeyColumn] = out.result.matchNode.restDatum;
@@ -2226,19 +2231,50 @@ spgdoinsert(Relation index, SpGistState *state,
 						leafSize += sizeof(ItemIdData);
 					}
 
+					/*
+					 * Check new tuple size; fail if it can't fit, unless the
+					 * opclass says it can handle the situation by suffixing.
+					 * In that case, prevent an infinite loop by checking to
+					 * see if we are actually making progress by reducing the
+					 * leafSize in each pass.  This is a bit tricky
+					 * though.  Because of alignment considerations, the total
+					 * tuple size might not decrease on every pass.  Also,
+					 * there are edge cases where the choose method might seem
+					 * to not make progress for a cycle or two.  Somewhat
+					 * arbitrarily, we allow up to 10 no-progress iterations
+					 * before failing.  (The limit should be more than
+					 * MAXALIGN, to accommodate opclasses that trim one byte
+					 * from the leaf datum per pass.)
+					 */
+					if (leafSize > SPGIST_PAGE_CAPACITY)
+					{
+						bool		ok = false;
+
+						if (state->config.longValuesOK && !isnull)
+						{
+							if (leafSize < prevLeafSize)
+							{
+								ok = true;
+								numNoProgressCycles = 0;
+							}
+							else if (++numNoProgressCycles < 10)
+								ok = true;
+						}
+						if (!ok)
+							ereport(ERROR,
+									(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+									 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+											leafSize - sizeof(ItemIdData),
+											SPGIST_PAGE_CAPACITY - sizeof(ItemIdData),
+											RelationGetRelationName(index)),
+									 errhint("Values larger than a buffer page cannot be indexed.")));
+					}
+
 					/*
 					 * Loop around and attempt to insert the new leafDatum at
 					 * "current" (which might reference an existing child
 					 * tuple, or might be invalid to force us to find a new
 					 * page for the tuple).
-					 *
-					 * Note: if the opclass sets longValuesOK, we rely on the
-					 * choose function to eventually shorten the leafDatum
-					 * enough to fit on a page.  We could add a test here to
-					 * complain if the datum doesn't get visibly shorter each
-					 * time, but that could get in the way of opclasses that
-					 * "simplify" datums in a way that doesn't necessarily
-					 * lead to physical shortening on every cycle.
 					 */
 					break;
 				case spgAddNode:
diff --git a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
index 0ac99d08fb..ac0ddcecea 100644
--- a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
+++ b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
@@ -61,6 +61,12 @@ select * from t
  binary_upgrade_set_next_toast_pg_class_oid |  1 | binary_upgrade_set_next_toast_pg_class_oid
 (9 rows)
 
+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR:  54000
+\set VERBOSITY default
 drop index t_f1_f2_f3_idx;
 create index on t using spgist(f1 name_ops_old) include(f2, f3);
 \d+ t_f1_f2_f3_idx
@@ -100,3 +106,7 @@ select * from t
  binary_upgrade_set_next_toast_pg_class_oid |  1 | binary_upgrade_set_next_toast_pg_class_oid
 (9 rows)
 
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR:  54000
+\set VERBOSITY default
diff --git a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
index 76e78ba41c..982f221a8b 100644
--- a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
+++ b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
@@ -27,6 +27,12 @@ select * from t
   where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
   order by 1;
 
+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default
+
 drop index t_f1_f2_f3_idx;
 create index on t using spgist(f1 name_ops_old) include(f2, f3);
 
@@ -39,3 +45,7 @@ select * from t
 select * from t
   where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
   order by 1;
+
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default
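To spell out why the no-progress limit has to exceed MAXALIGN: with
8-byte maxalign, an opclass that trims one byte of leaf datum per pass
shrinks the maxaligned size only once every eight passes, for example

	raw datum size:   32  31  30  29  28  27  26  25  24
	after MAXALIGN:   32  32  32  32  32  32  32  32  24

so up to MAXIMUM_ALIGNOF - 1 = 7 consecutive cycles can show no
reduction in leafSize even though real progress is being made; the
limit of 10 used above leaves some headroom beyond that.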