On Fri, Sep 25, 2020 at 12:05 PM Tom Lane <t...@sss.pgh.pa.us> wrote:
> Thomas Munro <thomas.mu...@gmail.com> writes:
> > Tom, do you have any thoughts on ShutdownCLOG() etc?
>
> Hm, if we cannot reach that without first completing a shutdown checkpoint,
> it does seem a little pointless.

Thanks for the sanity check.

> It'd likely be a good idea to add a comment to CheckPointCLOG et al
> explaining that we expect $what-exactly to fsync the data we are writing
> before the checkpoint is considered done.

Good point.  Done like this:

+       /*
+        * Write dirty CLOG pages to disk.  This may result in sync
requests queued
+        * for later handling by ProcessSyncRequests(), as part of the
checkpoint.
+        */
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
-       SimpleLruFlush(XactCtl, true);
+       SimpleLruWriteAll(XactCtl, true);
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);

Here's a new version.  The final thing I'm contemplating before
pushing this is whether there may be hidden magical dependencies in
the order of operations in CheckPointGuts(), which I've changed
around.  Andres, any comments?
From a722d71c4296e5fae778a783eb8140be7887eced Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 25 Sep 2020 11:16:03 +1200
Subject: [PATCH v8] Defer flushing of SLRU files.

Previously, we called fsync() after writing out individual pg_xact,
pg_multixact and pg_commit_ts pages due to cache pressure, leading to
regular I/O stalls in user backends and recovery.  Collapse requests for
the same file into a single system call as part of the next checkpoint,
as we already did for relation files, using the infrastructure developed
by commit 3eb77eba.  This causes a significant improvement to recovery
performance.

Remove the Shutdown{CLOG,CommitTS,SUBTRANS,MultiXact}() functions,
because they were redundant after the shutdown checkpoint that
immediately precedes them.

Hoist ProcessSyncRequests() up into CheckPointGuts() to make it clearer
that it applies to all the SLRU mini-buffer-pools as well as the main
buffer pool.  Also rearrange things so that data collected in
CheckpointStats includes SLRU activity.

Reviewed-by: Tom Lane <t...@sss.pgh.pa.us> (ShutdownXXX removal part)
Tested-by: Jakub Wartak <jakub.war...@tomtom.com>
Discussion: https://postgr.es/m/CA+hUKGLJ=84yt+nvhkeedauutvhmfq9i-n7k_o50jmq6rpj...@mail.gmail.com
---
 src/backend/access/transam/clog.c      |  39 +++----
 src/backend/access/transam/commit_ts.c |  36 +++---
 src/backend/access/transam/multixact.c |  57 +++++----
 src/backend/access/transam/slru.c      | 154 +++++++++++++++++--------
 src/backend/access/transam/subtrans.c  |  25 +---
 src/backend/access/transam/xlog.c      |  18 ++-
 src/backend/commands/async.c           |   5 +-
 src/backend/storage/buffer/bufmgr.c    |   7 --
 src/backend/storage/lmgr/predicate.c   |   8 +-
 src/backend/storage/sync/sync.c        |  28 ++++-
 src/include/access/clog.h              |   3 +
 src/include/access/commit_ts.h         |   3 +
 src/include/access/multixact.h         |   4 +
 src/include/access/slru.h              |  14 ++-
 src/include/storage/sync.h             |   7 +-
 src/tools/pgindent/typedefs.list       |   1 +
 16 files changed, 244 insertions(+), 165 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 9e352d2658..2ec6a924db 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "storage/proc.h"
+#include "storage/sync.h"
 
 /*
  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
@@ -691,7 +692,8 @@ CLOGShmemInit(void)
 {
 	XactCtl->PagePrecedes = CLOGPagePrecedes;
 	SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
-				  XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER);
+				  XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
+				  SYNC_HANDLER_CLOG);
 }
 
 /*
@@ -808,34 +810,18 @@ TrimCLOG(void)
 	LWLockRelease(XactSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownCLOG(void)
-{
-	/* Flush dirty CLOG pages to disk */
-	TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false);
-	SimpleLruFlush(XactCtl, false);
-
-	/*
-	 * fsync pg_xact to ensure that any files flushed previously are durably
-	 * on disk.
-	 */
-	fsync_fname("pg_xact", true);
-
-	TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
 void
 CheckPointCLOG(void)
 {
-	/* Flush dirty CLOG pages to disk */
+	/*
+	 * Write dirty CLOG pages to disk.  This may result in sync requests queued
+	 * for later handling by ProcessSyncRequests(), as part of the checkpoint.
+	 */
 	TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
-	SimpleLruFlush(XactCtl, true);
+	SimpleLruWriteAll(XactCtl, true);
 	TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
 }
 
@@ -1026,3 +1012,12 @@ clog_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "clog_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync clog files.
+ */
+int
+clogsyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(XactCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index f6a7329ba3..cb8a968801 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -555,7 +555,8 @@ CommitTsShmemInit(void)
 	CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
 	SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
 				  CommitTsSLRULock, "pg_commit_ts",
-				  LWTRANCHE_COMMITTS_BUFFER);
+				  LWTRANCHE_COMMITTS_BUFFER,
+				  SYNC_HANDLER_COMMIT_TS);
 
 	commitTsShared = ShmemInitStruct("CommitTs shared",
 									 sizeof(CommitTimestampShared),
@@ -798,30 +799,18 @@ DeactivateCommitTs(void)
 	LWLockRelease(CommitTsSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownCommitTs(void)
-{
-	/* Flush dirty CommitTs pages to disk */
-	SimpleLruFlush(CommitTsCtl, false);
-
-	/*
-	 * fsync pg_commit_ts to ensure that any files flushed previously are
-	 * durably on disk.
-	 */
-	fsync_fname("pg_commit_ts", true);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
 void
 CheckPointCommitTs(void)
 {
-	/* Flush dirty CommitTs pages to disk */
-	SimpleLruFlush(CommitTsCtl, true);
+	/*
+	 * Write dirty CommitTs pages to disk.  This may result in sync requests
+	 * queued for later handling by ProcessSyncRequests(), as part of the
+	 * checkpoint.
+	 */
+	SimpleLruWriteAll(CommitTsCtl, true);
 }
 
 /*
@@ -1077,3 +1066,12 @@ commit_ts_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "commit_ts_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync commit_ts files.
+ */
+int
+committssyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(CommitTsCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index b8bedca04a..a2ce617c8c 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1831,11 +1831,13 @@ MultiXactShmemInit(void)
 	SimpleLruInit(MultiXactOffsetCtl,
 				  "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
 				  MultiXactOffsetSLRULock, "pg_multixact/offsets",
-				  LWTRANCHE_MULTIXACTOFFSET_BUFFER);
+				  LWTRANCHE_MULTIXACTOFFSET_BUFFER,
+				  SYNC_HANDLER_MULTIXACT_OFFSET);
 	SimpleLruInit(MultiXactMemberCtl,
 				  "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
 				  MultiXactMemberSLRULock, "pg_multixact/members",
-				  LWTRANCHE_MULTIXACTMEMBER_BUFFER);
+				  LWTRANCHE_MULTIXACTMEMBER_BUFFER,
+				  SYNC_HANDLER_MULTIXACT_MEMBER);
 
 	/* Initialize our shared state struct */
 	MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -2100,19 +2102,6 @@ TrimMultiXact(void)
 	SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownMultiXact(void)
-{
-	/* Flush dirty MultiXact pages to disk */
-	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
-	SimpleLruFlush(MultiXactOffsetCtl, false);
-	SimpleLruFlush(MultiXactMemberCtl, false);
-	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
-}
-
 /*
  * Get the MultiXact data to save in a checkpoint record
  */
@@ -2143,9 +2132,13 @@ CheckPointMultiXact(void)
 {
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
 
-	/* Flush dirty MultiXact pages to disk */
-	SimpleLruFlush(MultiXactOffsetCtl, true);
-	SimpleLruFlush(MultiXactMemberCtl, true);
+	/*
+	 * Write dirty MultiXact pages to disk.  This may result in sync requests
+	 * queued for later handling by ProcessSyncRequests(), as part of the
+	 * checkpoint.
+	 */
+	SimpleLruWriteAll(MultiXactOffsetCtl, true);
+	SimpleLruWriteAll(MultiXactMemberCtl, true);
 
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
 }
@@ -2728,14 +2721,10 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
 	entryno = MultiXactIdToOffsetEntry(multi);
 
 	/*
-	 * Flush out dirty data, so PhysicalPageExists can work correctly.
-	 * SimpleLruFlush() is a pretty big hammer for that.  Alternatively we
-	 * could add an in-memory version of page exists, but find_multixact_start
-	 * is called infrequently, and it doesn't seem bad to flush buffers to
-	 * disk before truncation.
+	 * Write out dirty data, so PhysicalPageExists can work correctly.
 	 */
-	SimpleLruFlush(MultiXactOffsetCtl, true);
-	SimpleLruFlush(MultiXactMemberCtl, true);
+	SimpleLruWriteAll(MultiXactOffsetCtl, true);
+	SimpleLruWriteAll(MultiXactMemberCtl, true);
 
 	if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
 		return false;
@@ -3386,3 +3375,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS)
 
 	SRF_RETURN_DONE(funccxt);
 }
+
+/*
+ * Entrypoint for sync.c to sync offsets files.
+ */
+int
+multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+multixactmemberssyncfiletag(const FileTag *ftag, char *path)
+{
+	return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index fe7d759a8c..636b90db87 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -63,22 +63,33 @@
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
 
 /*
- * During SimpleLruFlush(), we will usually not need to write/fsync more
- * than one or two physical files, but we may need to write several pages
- * per file.  We can consolidate the I/O requests by leaving files open
- * until control returns to SimpleLruFlush().  This data structure remembers
- * which files are open.
+ * During SimpleLruWriteAll(), we will usually not need to write more than one
+ * or two physical files, but we may need to write several pages per file.  We
+ * can consolidate the I/O requests by leaving files open until control returns
+ * to SimpleLruWriteAll().  This data structure remembers which files are open.
  */
-#define MAX_FLUSH_BUFFERS	16
+#define MAX_WRITEALL_BUFFERS	16
 
-typedef struct SlruFlushData
+typedef struct SlruWriteAllData
 {
 	int			num_files;		/* # files actually open */
-	int			fd[MAX_FLUSH_BUFFERS];	/* their FD's */
-	int			segno[MAX_FLUSH_BUFFERS];	/* their log seg#s */
-} SlruFlushData;
+	int			fd[MAX_WRITEALL_BUFFERS];		/* their FD's */
+	int			segno[MAX_WRITEALL_BUFFERS];	/* their log seg#s */
+} SlruWriteAllData;
 
-typedef struct SlruFlushData *SlruFlush;
+typedef struct SlruWriteAllData *SlruWriteAll;
+
+/*
+ * Populate a file tag describing a segment file.  We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
+( \
+	memset(&(a), 0, sizeof(FileTag)), \
+	(a).handler = (xx_handler), \
+	(a).segno = (xx_segno) \
+)
 
 /*
  * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
@@ -125,10 +136,10 @@ static int	slru_errno;
 
 static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
 static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
-static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
+static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
 static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
 static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
-								  SlruFlush fdata);
+								  SlruWriteAll fdata);
 static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
 static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);
 
@@ -173,7 +184,8 @@ SimpleLruShmemSize(int nslots, int nlsns)
  */
 void
 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-			  LWLock *ctllock, const char *subdir, int tranche_id)
+			  LWLock *ctllock, const char *subdir, int tranche_id,
+			  SyncRequestHandler sync_handler)
 {
 	SlruShared	shared;
 	bool		found;
@@ -251,7 +263,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 	 * assume caller set PagePrecedes.
 	 */
 	ctl->shared = shared;
-	ctl->do_fsync = true;		/* default behavior */
+	ctl->sync_handler = sync_handler;
 	strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
 }
 
@@ -523,7 +535,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
  * Control lock must be held at entry, and will be held at exit.
  */
 static void
-SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
+SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
 {
 	SlruShared	shared = ctl->shared;
 	int			pageno = shared->page_number[slotno];
@@ -587,6 +599,10 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 	/* Now it's okay to ereport if we failed */
 	if (!ok)
 		SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+	/* If part of a checkpoint, count this as a buffer written. */
+	if (fdata)
+		CheckpointStats.ckpt_bufs_written++;
 }
 
 /*
@@ -730,13 +746,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  *
  * For now, assume it's not worth keeping a file pointer open across
  * independent read/write operations.  We do batch operations during
- * SimpleLruFlush, though.
+ * SimpleLruWriteAll, though.
  *
  * fdata is NULL for a standalone write, pointer to open-file info during
- * SimpleLruFlush.
+ * SimpleLruWriteAll.
  */
 static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
 {
 	SlruShared	shared = ctl->shared;
 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
@@ -791,7 +807,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	}
 
 	/*
-	 * During a Flush, we may already have the desired file open.
+	 * During a WriteAll, we may already have the desired file open.
 	 */
 	if (fdata)
 	{
@@ -837,7 +853,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 
 		if (fdata)
 		{
-			if (fdata->num_files < MAX_FLUSH_BUFFERS)
+			if (fdata->num_files < MAX_WRITEALL_BUFFERS)
 			{
 				fdata->fd[fdata->num_files] = fd;
 				fdata->segno[fdata->num_files] = segno;
@@ -870,23 +886,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	}
 	pgstat_report_wait_end();
 
-	/*
-	 * If not part of Flush, need to fsync now.  We assume this happens
-	 * infrequently enough that it's not a performance issue.
-	 */
-	if (!fdata)
+	/* Queue up a sync request for the checkpointer. */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
-		if (ctl->do_fsync && pg_fsync(fd) != 0)
+		FileTag		tag;
+
+		INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+		if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
 		{
+			/* No space to enqueue sync request.  Do it synchronously. */
+			pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
+			if (pg_fsync(fd) != 0)
+			{
+				pgstat_report_wait_end();
+				slru_errcause = SLRU_FSYNC_FAILED;
+				slru_errno = errno;
+				CloseTransientFile(fd);
+				return false;
+			}
 			pgstat_report_wait_end();
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			CloseTransientFile(fd);
-			return false;
 		}
-		pgstat_report_wait_end();
+	}
 
+	/* Close file, unless part of flush request. */
+	if (!fdata)
+	{
 		if (CloseTransientFile(fd) != 0)
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1122,13 +1146,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 }
 
 /*
- * Flush dirty pages to disk during checkpoint or database shutdown
+ * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
+ * is deferred until the next call to ProcessSyncRequests(), though we do fsync
+ * the containing directory here to make sure that newly created directory
+ * entries are on disk.
  */
 void
-SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
+SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
 {
 	SlruShared	shared = ctl->shared;
-	SlruFlushData fdata;
+	SlruWriteAllData fdata;
 	int			slotno;
 	int			pageno = 0;
 	int			i;
@@ -1162,21 +1189,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 	LWLockRelease(shared->ControlLock);
 
 	/*
-	 * Now fsync and close any files that were open
+	 * Now close any files that were open
 	 */
 	ok = true;
 	for (i = 0; i < fdata.num_files; i++)
 	{
-		pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
-		if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0)
-		{
-			slru_errcause = SLRU_FSYNC_FAILED;
-			slru_errno = errno;
-			pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
-			ok = false;
-		}
-		pgstat_report_wait_end();
-
 		if (CloseTransientFile(fdata.fd[i]) != 0)
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
@@ -1189,7 +1206,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
 		SlruReportIOError(ctl, pageno, InvalidTransactionId);
 
 	/* Ensure that directory entries for new files are on disk. */
-	if (ctl->do_fsync)
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
 		fsync_fname(ctl->Dir, true);
 }
 
@@ -1350,6 +1367,19 @@ restart:
 	snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
 	ereport(DEBUG2,
 			(errmsg("removing file \"%s\"", path)));
+
+	/*
+	 * Tell the checkpointer to forget any sync requests, before we unlink the
+	 * file.
+	 */
+	if (ctl->sync_handler != SYNC_HANDLER_NONE)
+	{
+		FileTag		tag;
+
+		INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+		RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+	}
+
 	unlink(path);
 
 	LWLockRelease(shared->ControlLock);
@@ -1448,3 +1478,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 
 	return retval;
 }
+
+/*
+ * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
+ * that they can provide the correct "SlruCtl" (otherwise we don't know how to
+ * build the path), but they just forward to this common implementation that
+ * performs the fsync.
+ */
+int
+SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
+{
+	int			fd;
+	int			save_errno;
+	int			result;
+
+	SlruFileName(ctl, path, ftag->segno);
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd < 0)
+		return -1;
+
+	result = pg_fsync(fd);
+	save_errno = errno;
+
+	CloseTransientFile(fd);
+
+	errno = save_errno;
+	return result;
+}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index a50f60b99a..0111e867c7 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -193,9 +193,7 @@ SUBTRANSShmemInit(void)
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
 	SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
 				  SubtransSLRULock, "pg_subtrans",
-				  LWTRANCHE_SUBTRANS_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	SubTransCtl->do_fsync = false;
+				  LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
 }
 
 /*
@@ -278,23 +276,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID)
 	LWLockRelease(SubtransSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownSUBTRANS(void)
-{
-	/*
-	 * Flush dirty SUBTRANS pages to disk
-	 *
-	 * This is not actually necessary from a correctness point of view. We do
-	 * it merely as a debugging aid.
-	 */
-	TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false);
-	SimpleLruFlush(SubTransCtl, false);
-	TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
@@ -302,14 +283,14 @@ void
 CheckPointSUBTRANS(void)
 {
 	/*
-	 * Flush dirty SUBTRANS pages to disk
+	 * Write dirty SUBTRANS pages to disk
 	 *
 	 * This is not actually necessary from a correctness point of view. We do
 	 * it merely to improve the odds that writing of dirty pages is done by
 	 * the checkpoint process and not by backends.
 	 */
 	TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
-	SimpleLruFlush(SubTransCtl, true);
+	SimpleLruWriteAll(SubTransCtl, true);
 	TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
 }
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 61754312e2..ed17dd0d12 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8528,10 +8528,6 @@ ShutdownXLOG(int code, Datum arg)
 
 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 	}
-	ShutdownCLOG();
-	ShutdownCommitTs();
-	ShutdownSUBTRANS();
-	ShutdownMultiXact();
 }
 
 /*
@@ -9176,16 +9172,28 @@ CreateEndOfRecoveryRecord(void)
 static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
+	/* Write out all dirty data in SLRUs and the main buffer pool */
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
+	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	CheckPointCLOG();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
 	CheckPointPredicate();
+	CheckPointBuffers(flags);
+
+	/* Perform all required fsyncs */
+	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
+	ProcessSyncRequests();
+	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
+
+	/* Other activities performed at checkpoint time */
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
 	CheckPointSnapBuild();
 	CheckPointLogicalRewriteHeap();
-	CheckPointBuffers(flags);	/* performs all required fsyncs */
 	CheckPointReplicationOrigin();
 	/* We deliberately delay 2PC checkpointing as long as possible */
 	CheckPointTwoPhase(checkPointRedo);
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index cb341365df..8dbcace3f9 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -554,9 +554,8 @@ AsyncShmemInit(void)
 	 */
 	NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
 	SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
-				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	NotifyCtl->do_fsync = false;
+				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
+				  SYNC_HANDLER_NONE);
 
 	if (!found)
 	{
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a2a963bd5b..e549fa1d30 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2636,14 +2636,7 @@ PrintBufferLeakWarning(Buffer buffer)
 void
 CheckPointBuffers(int flags)
 {
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
-	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	BufferSync(flags);
-	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-	ProcessSyncRequests();
-	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 }
 
 
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index a2f8e7524b..8a365b400c 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -821,9 +821,7 @@ SerialInit(void)
 	SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
 	SimpleLruInit(SerialSlruCtl, "Serial",
 				  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
-				  LWTRANCHE_SERIAL_BUFFER);
-	/* Override default assumption that writes should be fsync'd */
-	SerialSlruCtl->do_fsync = false;
+				  LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
 
 	/*
 	 * Create or attach to the SerialControl structure.
@@ -1052,7 +1050,7 @@ CheckPointPredicate(void)
 	SimpleLruTruncate(SerialSlruCtl, tailPage);
 
 	/*
-	 * Flush dirty SLRU pages to disk
+	 * Write dirty SLRU pages to disk
 	 *
 	 * This is not actually necessary from a correctness point of view. We do
 	 * it merely as a debugging aid.
@@ -1061,7 +1059,7 @@ CheckPointPredicate(void)
 	 * before deleting the file in which they sit, which would be completely
 	 * pointless.
 	 */
-	SimpleLruFlush(SerialSlruCtl, true);
+	SimpleLruWriteAll(SerialSlruCtl, true);
 }
 
 /*------------------------------------------------------------------------*/
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 3ded2cdd71..1d635d596c 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -18,6 +18,9 @@
 #include <fcntl.h>
 #include <sys/file.h>
 
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
@@ -90,12 +93,31 @@ typedef struct SyncOps
 										const FileTag *candidate);
 } SyncOps;
 
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
 static const SyncOps syncsw[] = {
 	/* magnetic disk */
-	{
+	[SYNC_HANDLER_MD] = {
 		.sync_syncfiletag = mdsyncfiletag,
 		.sync_unlinkfiletag = mdunlinkfiletag,
 		.sync_filetagmatches = mdfiletagmatches
+	},
+	/* pg_xact */
+	[SYNC_HANDLER_CLOG] = {
+		.sync_syncfiletag = clogsyncfiletag
+	},
+	/* pg_commit_ts */
+	[SYNC_HANDLER_COMMIT_TS] = {
+		.sync_syncfiletag = committssyncfiletag
+	},
+	/* pg_multixact/offsets */
+	[SYNC_HANDLER_MULTIXACT_OFFSET] = {
+		.sync_syncfiletag = multixactoffsetssyncfiletag
+	},
+	/* pg_multixact/members */
+	[SYNC_HANDLER_MULTIXACT_MEMBER] = {
+		.sync_syncfiletag = multixactmemberssyncfiletag
 	}
 };
 
@@ -505,8 +527,8 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
 												  (void *) ftag,
 												  HASH_ENTER,
 												  &found);
-		/* if new entry, initialize it */
-		if (!found)
+		/* if new entry, or was previously canceled, initialize it */
+		if (!found || entry->canceled)
 		{
 			entry->cycle_ctr = sync_cycle_ctr;
 			entry->canceled = false;
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 2db8acb189..d97b9042dc 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -12,6 +12,7 @@
 #define CLOG_H
 
 #include "access/xlogreader.h"
+#include "storage/sync.h"
 #include "lib/stringinfo.h"
 
 /*
@@ -50,6 +51,8 @@ extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
 
+extern int clogsyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define CLOG_ZEROPAGE		0x00
 #define CLOG_TRUNCATE		0x10
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index 2740c02a84..27900ce430 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -14,6 +14,7 @@
 #include "access/xlog.h"
 #include "datatype/timestamp.h"
 #include "replication/origin.h"
+#include "storage/sync.h"
 #include "utils/guc.h"
 
 
@@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact,
 							 TransactionId newestXact);
 extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
 
+extern int committssyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define COMMIT_TS_ZEROPAGE		0x00
 #define COMMIT_TS_TRUNCATE		0x10
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 6d729008c6..71d6e78063 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -13,6 +13,7 @@
 
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
+#include "storage/sync.h"
 
 
 /*
@@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
 										MultiXactId multi2);
 
+extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
+extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path);
+
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
 extern void PostPrepare_MultiXact(TransactionId xid);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 61fbc80ef0..b39b43504d 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -15,6 +15,7 @@
 
 #include "access/xlogdefs.h"
 #include "storage/lwlock.h"
+#include "storage/sync.h"
 
 
 /*
@@ -111,10 +112,10 @@ typedef struct SlruCtlData
 	SlruShared	shared;
 
 	/*
-	 * This flag tells whether to fsync writes (true for pg_xact and multixact
-	 * stuff, false for pg_subtrans and pg_notify).
+	 * Which sync handler function to use when handing sync requests over to
+	 * the checkpointer.  SYNC_HANDLER_NONE to disable fsync (eg pg_notify).
 	 */
-	bool		do_fsync;
+	SyncRequestHandler sync_handler;
 
 	/*
 	 * Decide which of two page numbers is "older" for truncation purposes. We
@@ -135,14 +136,15 @@ typedef SlruCtlData *SlruCtl;
 
 extern Size SimpleLruShmemSize(int nslots, int nlsns);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-						  LWLock *ctllock, const char *subdir, int tranche_id);
+						  LWLock *ctllock, const char *subdir, int tranche_id,
+						  SyncRequestHandler sync_handler);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
 extern int	SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 							  TransactionId xid);
 extern int	SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
 									   TransactionId xid);
 extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
-extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied);
+extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
 extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
 
@@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
 extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
 extern void SlruDeleteSegment(SlruCtl ctl, int segno);
 
+extern int	SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path);
+
 /* SlruScanDirectory public callbacks */
 extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
 										int segpage, void *data);
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index e16ab8e711..f32e412e75 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -34,7 +34,12 @@ typedef enum SyncRequestType
  */
 typedef enum SyncRequestHandler
 {
-	SYNC_HANDLER_MD = 0			/* md smgr */
+	SYNC_HANDLER_MD = 0,
+	SYNC_HANDLER_CLOG,
+	SYNC_HANDLER_COMMIT_TS,
+	SYNC_HANDLER_MULTIXACT_OFFSET,
+	SYNC_HANDLER_MULTIXACT_MEMBER,
+	SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
 /*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index b1afb345c3..88fc3ceae4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2404,6 +2404,7 @@ Syn
 SyncOps
 SyncRepConfigData
 SyncRepStandbyData
+SyncRequestHandler
 SyncRequestType
 SysScanDesc
 SyscacheCallbackFunction
-- 
2.20.1

Reply via email to