From 806bee1efdde958bb3d819626e0cfaf624cf2055 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Thu, 9 Aug 2018 16:57:57 +0530
Subject: [PATCH 2/4] Fix deadlock in AbsorbAllFsyncRequests().

---
 src/backend/postmaster/checkpointer.c | 54 +++++++++++++--------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 9ef56db97bc..6250cb21946 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -112,6 +112,7 @@ typedef struct
 	ForkNumber	forknum;
 	BlockNumber segno;			/* see md.c for special values */
 	bool		contains_fd;
+	int			ckpt_started;
 	/* might add a real request-type field later; not needed yet */
 } CheckpointerRequest;
 
@@ -169,9 +170,6 @@ static double ckpt_cached_elapsed;
 static pg_time_t last_checkpoint_time;
 static pg_time_t last_xlog_switch_time;
 
-static BlockNumber next_syn_rqst;
-static BlockNumber received_syn_rqst;
-
 /* Prototypes for private functions */
 
 static void CheckArchiveTimeout(void);
@@ -179,7 +177,7 @@ static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
 static void UpdateSharedMemoryConfig(void);
 static void SendFsyncRequest(CheckpointerRequest *request, int fd);
-static bool AbsorbFsyncRequest(void);
+static bool AbsorbFsyncRequest(bool stop_at_current_cycle);
 
 /* Signal handlers */
 
@@ -1105,6 +1103,11 @@ RequestCheckpoint(int flags)
  * is theoretically possible a backend fsync might still be necessary, if
  * the queue is full and contains no duplicate entries.  In that case, we
  * let the backend know by returning false.
+ *
+ * We add the cycle counter to the message.  That is an unsynchronized read
+ * of the shared memory counter, but it doesn't matter if it is arbitrarily
+ * old since it is only used to limit unnecessary extra queue draining in
+ * AbsorbAllFsyncRequests().
  */
 void
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno,
@@ -1124,6 +1127,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno,
 	request.segno = segno;
 	request.contains_fd = file != -1;
 
+	/*
+	 * We read ckpt_started without synchronization.  It is used to prevent
+	 * AbsorbAllFsyncRequests() from reading new values from after a
+	 * checkpoint began.  A slightly out-of-date value here will only cause
+	 * it to do a little bit more work than strictly necessary, but that's
+	 * OK.
+	 */
+	request.ckpt_started = CheckpointerShmem->ckpt_started;
+
 	SendFsyncRequest(&request, request.contains_fd ? FileGetRawDesc(file) : -1);
 }
 
@@ -1152,7 +1164,7 @@ AbsorbFsyncRequests(void)
 		if (!FlushFsyncRequestQueueIfNecessary())
 			break;
 
-		if (!AbsorbFsyncRequest())
+		if (!AbsorbFsyncRequest(false))
 			break;
 	}
 }
@@ -1170,8 +1182,6 @@ AbsorbFsyncRequests(void)
 void
 AbsorbAllFsyncRequests(void)
 {
-	CheckpointerRequest request = {0};
-
 	if (!AmCheckpointerProcess())
 		return;
 
@@ -1181,22 +1191,12 @@ AbsorbAllFsyncRequests(void)
 	BgWriterStats.m_buf_fsync_backend +=
 		pg_atomic_exchange_u32(&CheckpointerShmem->num_backend_fsync, 0);
 
-	/*
-	 * For mdsync()'s guarantees to work, all pending fsync requests need to
-	 * be executed. But we don't want to absorb requests till the queue is
-	 * empty, as that could take a long while.  So instead we enqueue
-	 */
-	request.type = CKPT_REQUEST_SYN;
-	request.segno = ++next_syn_rqst;
-	SendFsyncRequest(&request, -1);
-
-	received_syn_rqst = next_syn_rqst + 1;
-	while (received_syn_rqst != request.segno)
+	for (;;)
 	{
 		if (!FlushFsyncRequestQueueIfNecessary())
 			elog(FATAL, "may not happen");
 
-		if (!AbsorbFsyncRequest())
+		if (!AbsorbFsyncRequest(true))
 			break;
 	}
 }
@@ -1206,7 +1206,7 @@ AbsorbAllFsyncRequests(void)
  *		Retrieve one queued fsync request and pass them to local smgr.
  */
 static bool
-AbsorbFsyncRequest(void)
+AbsorbFsyncRequest(bool stop_at_current_cycle)
 {
 	CheckpointerRequest req;
 	int fd;
@@ -1229,17 +1229,13 @@ AbsorbFsyncRequest(void)
 		elog(FATAL, "message should have fd associated, but doesn't");
 	}
 
-	if (req.type == CKPT_REQUEST_SYN)
-	{
-		received_syn_rqst = req.segno;
-		Assert(fd == -1);
-	}
-	else
-	{
-		RememberFsyncRequest(req.rnode, req.forknum, req.segno, fd);
-	}
+	RememberFsyncRequest(req.rnode, req.forknum, req.segno, fd);
 	END_CRIT_SECTION();
 
+	if (stop_at_current_cycle &&
+		req.ckpt_started == CheckpointerShmem->ckpt_started)
+		return false;
+
 	return true;
 }
 
-- 
2.17.0

