On Tue, Sep 29, 2020 at 9:12 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> On Tue, Sep 29, 2020 at 7:11 PM Michael Paquier <mich...@paquier.xyz> wrote:
> > #2  0x00000000009027d2 in ExceptionalCondition
> > (conditionName=conditionName@entry=0xa80846 "!barrier->static_party",
>
> > #4  0x0000000000682ebf in ExecParallelHashJoinNewBatch
>
> Thanks.  Ohhh.  I think I see how that condition was reached and what
> to do about it, but I'll need to look more closely.  I'm away on
> vacation right now, and will update in a couple of days when I'm back
> at a real computer.

Here's a throw-away patch that adds some sleeps to make the problem
reproducible, and a first draft of a fix.  I'll do some more testing
next week and see if I can simplify it.
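
In case it's useful to see the shape of the fix outside the tree, here's
a minimal standalone sketch of the pattern (plain pthreads and a toy
two-field barrier, not our barrier.c; all the names are invented for
illustration).  The point is that the last participant to detach
advances the phase atomically with its detach, so a straggler that
attaches later observes the advanced phase and knows not to touch the
freed array.  It compiles with cc -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* A toy phase barrier: a phase counter and an attach count, under a mutex. */
typedef struct ToyBarrier
{
	pthread_mutex_t mutex;
	int			phase;			/* only ever advances */
	int			participants;	/* number of threads currently attached */
} ToyBarrier;

#define TOY_BUILD_RUNNING	0	/* cf. PHJ_BUILD_RUNNING */
#define TOY_BUILD_DONE		1	/* cf. PHJ_BUILD_DONE */

static ToyBarrier build_barrier = {PTHREAD_MUTEX_INITIALIZER, TOY_BUILD_RUNNING, 0};
static int *shared_batches;		/* stands in for pstate->batches */

/* Attach, returning the phase observed atomically at attach time. */
static int
toy_attach(ToyBarrier *b)
{
	int			phase;

	pthread_mutex_lock(&b->mutex);
	b->participants++;
	phase = b->phase;
	pthread_mutex_unlock(&b->mutex);
	return phase;
}

/* Detach without advancing the phase, for stragglers backing out. */
static void
toy_detach(ToyBarrier *b)
{
	pthread_mutex_lock(&b->mutex);
	b->participants--;
	pthread_mutex_unlock(&b->mutex);
}

/*
 * Detach, and if we were the last participant, advance the phase in the
 * same critical section and report that to the caller.  Any thread that
 * attaches after us must observe the new phase, so "last out" can free
 * shared state without racing against late attachers.
 */
static bool
toy_arrive_and_detach(ToyBarrier *b)
{
	bool		last;

	pthread_mutex_lock(&b->mutex);
	last = (--b->participants == 0);
	if (last)
		b->phase++;
	pthread_mutex_unlock(&b->mutex);
	return last;
}

static void *
worker(void *arg)
{
	int			myindex = (int) (intptr_t) arg;

	if (toy_attach(&build_barrier) == TOY_BUILD_DONE)
	{
		/* Attached too late: the array may already be freed, so back out
		 * without ever dereferencing it. */
		toy_detach(&build_barrier);
		return NULL;
	}

	/*
	 * The phase was TOY_BUILD_RUNNING when we attached, and it cannot
	 * advance while we remain attached, so the array is safe to use
	 * until we detach.
	 */
	shared_batches[myindex] = 42;	/* pretend to do some per-batch work */

	if (toy_arrive_and_detach(&build_barrier))
	{
		/* We were last out: the phase is now TOY_BUILD_DONE, so no later
		 * attacher will touch the array and freeing it is safe. */
		free(shared_batches);
		shared_batches = NULL;
	}
	return NULL;
}

int
main(void)
{
	pthread_t	threads[4];

	shared_batches = calloc(4, sizeof(int));
	for (int i = 0; i < 4; i++)
		pthread_create(&threads[i], NULL, worker, (void *) (intptr_t) i);
	for (int i = 0; i < 4; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

The real barrier.c does much more than this (waiting, elections,
multiple phases), but that's the essence of why BarrierArriveAndDetach()
closes the hole left by checking pstate->batches for InvalidDsaPointer:
the sleeps in the first patch just widen the windows on either side of
that atomic step.
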
From 65f70e0be36ec61e1993907162cfd4edac46e063 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 2 Oct 2020 15:24:23 +1300
Subject: [PATCH 1/2] Inject fault timing

---
 src/backend/executor/nodeHash.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index ea69eeb2a1..244805e69b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -25,6 +25,7 @@
 
 #include <math.h>
 #include <limits.h>
+#include <unistd.h>
 
 #include "access/htup_details.h"
 #include "access/parallel.h"
@@ -585,6 +586,13 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 		ParallelHashJoinState *pstate = hashtable->parallel_state;
 		Barrier    *build_barrier;
 
+		if (ParallelWorkerNumber >= 1)
+		{
+			elog(LOG, "a short nap before attaching to build_barrier...");
+			sleep(2);
+			elog(LOG, "nap finished");
+		}
+
 		/*
 		 * Attach to the build barrier.  The corresponding detach operation is
 		 * in ExecHashTableDetach.  Note that we won't attach to the
@@ -3198,6 +3206,9 @@ ExecHashTableDetach(HashJoinTable hashtable)
 			if (DsaPointerIsValid(pstate->batches))
 			{
 				dsa_free(hashtable->area, pstate->batches);
+				elog(LOG, "batch array freed, taking a long nap...");
+				sleep(5);
+				elog(LOG, "finished nap, clearing pointer");
 				pstate->batches = InvalidDsaPointer;
 			}
 		}
-- 
2.20.1

From 21f745905dcaff738326434943e70c4292aae4a4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 2 Oct 2020 15:53:44 +1300
Subject: [PATCH 2/2] Fix race condition in parallel hash join batch cleanup.

With unlucky timing and parallel_leader_participation off, PHJ could
attempt to access per-batch state just as it was being freed.  There was
code intended to prevent that by checking for a cleared pointer, but it
was racy.  Fix by introducing an extra barrier phase.  The new phase
PHJ_BUILD_RUNNING means that it's safe to access the per-batch state to
find a batch to help with, and PHJ_BUILD_DONE means that it is too late.
The last to detach will free the array of per-batch state as before, but
now it will also atomically advance the phase at the same time, so that
late attachers can avoid the hazard.  This mirrors the way per-batch
hash tables are freed (see phases PHJ_BATCH_PROBING and PHJ_BATCH_DONE).

Revealed by a build farm failure in which BarrierAttach() failed a sanity
check assertion because the memory had been clobbered by dsa_free().

Discussion: https://postgr.es/m/20200929061142.GA29096%40paquier.xyz
---
 src/backend/executor/nodeHash.c     | 47 ++++++++++++++++++++---------
 src/backend/executor/nodeHashjoin.c | 40 ++++++++++++++----------
 src/include/executor/hashjoin.h     |  3 +-
 3 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 244805e69b..b1013b452b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -334,14 +334,21 @@ MultiExecParallelHash(HashState *node)
 	hashtable->nbuckets = pstate->nbuckets;
 	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
 	hashtable->totalTuples = pstate->total_tuples;
-	ExecParallelHashEnsureBatchAccessors(hashtable);
+
+	/*
+	 * Unless we're completely done and the batch state has been freed, make
+	 * sure we have accessors.
+	 */
+	if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
+		ExecParallelHashEnsureBatchAccessors(hashtable);
 
 	/*
 	 * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
-	 * case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
+	 * case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
 	 * there already).
 	 */
 	Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+		   BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
 		   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 }
 
@@ -632,7 +639,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 		/*
 		 * The next Parallel Hash synchronization point is in
 		 * MultiExecParallelHash(), which will progress it all the way to
-		 * PHJ_BUILD_DONE.  The caller must not return control from this
+		 * PHJ_BUILD_RUNNING.  The caller must not return control from this
 		 * executor node between now and then.
 		 */
 	}
@@ -3056,14 +3063,11 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
 	}
 
 	/*
-	 * It's possible for a backend to start up very late so that the whole
-	 * join is finished and the shm state for tracking batches has already
-	 * been freed by ExecHashTableDetach().  In that case we'll just leave
-	 * hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
-	 * up early.
+	 * We should never see a state where the batch-tracking array is freed,
+	 * because we should have given up sooner if we joined after the build
+	 * barrier reached the PHJ_BUILD_DONE phase.
 	 */
-	if (!DsaPointerIsValid(pstate->batches))
-		return;
+	Assert(DsaPointerIsValid(pstate->batches));
 
 	/* Use hash join memory context. */
 	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -3183,9 +3187,17 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
 void
 ExecHashTableDetach(HashJoinTable hashtable)
 {
-	if (hashtable->parallel_state)
+	ParallelHashJoinState *pstate = hashtable->parallel_state;
+
+	/*
+	 * If we're involved in a parallel query, we must either have got all the
+	 * way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
+	 */
+	Assert(!pstate ||
+		   BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
+
+	if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
 	{
-		ParallelHashJoinState *pstate = hashtable->parallel_state;
 		int			i;
 
 		/* Make sure any temporary files are closed. */
@@ -3201,8 +3213,14 @@ ExecHashTableDetach(HashJoinTable hashtable)
 		}
 
 		/* If we're last to detach, clean up shared memory. */
-		if (BarrierDetach(&pstate->build_barrier))
+		if (BarrierArriveAndDetach(&pstate->build_barrier))
 		{
+			/*
+			 * Late joining processes will see this state and give up
+			 * immediately.
+			 */
+			Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
+
 			if (DsaPointerIsValid(pstate->batches))
 			{
 				dsa_free(hashtable->area, pstate->batches);
@@ -3212,9 +3230,8 @@ ExecHashTableDetach(HashJoinTable hashtable)
 				pstate->batches = InvalidDsaPointer;
 			}
 		}
-
-		hashtable->parallel_state = NULL;
 	}
+	hashtable->parallel_state = NULL;
 }
 
 /*
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5532b91a71..b996557ac4 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -45,7 +45,8 @@
  *   PHJ_BUILD_ALLOCATING            -- one sets up the batches and table 0
  *   PHJ_BUILD_HASHING_INNER         -- all hash the inner rel
  *   PHJ_BUILD_HASHING_OUTER         -- (multi-batch only) all hash the outer
- *   PHJ_BUILD_DONE                  -- building done, probing can begin
+ *   PHJ_BUILD_RUNNING               -- building done, probing can begin
+ *   PHJ_BUILD_DONE                  -- all work complete, one frees batches
  *
  * While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
  * be used repeatedly as required to coordinate expansions in the number of
@@ -73,7 +74,7 @@
  * batches whenever it encounters them while scanning and probing, which it
  * can do because it processes batches in serial order.
  *
- * Once PHJ_BUILD_DONE is reached, backends then split up and process
+ * Once PHJ_BUILD_RUNNING is reached, backends then split up and process
  * different batches, or gang up and work together on probing batches if there
  * aren't enough to go around.  For each batch there is a separate barrier
  * with the following phases:
@@ -95,11 +96,16 @@
  *
  * To avoid deadlocks, we never wait for any barrier unless it is known that
  * all other backends attached to it are actively executing the node or have
- * already arrived.  Practically, that means that we never return a tuple
- * while attached to a barrier, unless the barrier has reached its final
- * state.  In the slightly special case of the per-batch barrier, we return
- * tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
- * BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
+ * finished.  Practically, that means that we never emit a tuple while attached
+ * to a barrier, unless the barrier has reached a phase in which no
+ * process will wait on it again.  We emit tuples while attached to the build
+ * barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase
+ * PHJ_BATCH_PROBING.  These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE
+ * respectively without waiting, using BarrierArriveAndDetach().  The last to
+ * detach receives a different return value so that it knows that it's safe to
+ * clean up.  Any straggler process that attaches after that phase is reached
+ * will see that it's too late to participate or access the relevant shared
+ * memory objects.
  *
  *-------------------------------------------------------------------------
  */
@@ -317,6 +323,7 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 
 					build_barrier = &parallel_state->build_barrier;
 					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
+						   BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
 						   BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 					if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
 					{
@@ -328,10 +335,19 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
 							ExecParallelHashJoinPartitionOuter(node);
 						BarrierArriveAndWait(build_barrier,
 											 WAIT_EVENT_HASH_BUILD_HASH_OUTER);
+					}
+					else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE)
+					{
+						/*
+						 * If we attached so late that the job is finished
+						 * and the batch state has been freed, we can return
+						 * immediately.
+						 */
+						return NULL;
 					}
-					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
 
 					/* Each backend should now select a batch to work on. */
+					Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING);
 					hashtable->curbatch = -1;
 					node->hj_JoinState = HJ_NEED_NEW_BATCH;
 
@@ -1090,14 +1106,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 	int			start_batchno;
 	int			batchno;
 
-	/*
-	 * If we started up so late that the batch tracking array has been freed
-	 * already by ExecHashTableDetach(), then we are finished.  See also
-	 * ExecParallelHashEnsureBatchAccessors().
-	 */
-	if (hashtable->batches == NULL)
-		return false;
-
 	/*
 	 * If we were already attached to a batch, remember not to bother checking
 	 * it again, and detach from it (possibly freeing the hash table if we are
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index eb5daba36b..443ba6eb9f 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -258,7 +258,8 @@ typedef struct ParallelHashJoinState
 #define PHJ_BUILD_ALLOCATING			1
 #define PHJ_BUILD_HASHING_INNER			2
 #define PHJ_BUILD_HASHING_OUTER			3
-#define PHJ_BUILD_DONE					4
+#define PHJ_BUILD_RUNNING				4
+#define PHJ_BUILD_DONE					5
 
 /* The phases for probing each batch, used by for batch_barrier. */
 #define PHJ_BATCH_ELECTING				0
-- 
2.20.1
