On Fri, Sep 25, 2020 at 12:05 PM Tom Lane <t...@sss.pgh.pa.us> wrote: > Thomas Munro <thomas.mu...@gmail.com> writes: > > Tom, do you have any thoughts on ShutdownCLOG() etc? > > Hm, if we cannot reach that without first completing a shutdown checkpoint, > it does seem a little pointless.
Thanks for the sanity check. > It'd likely be a good idea to add a comment to CheckPointCLOG et al > explaining that we expect $what-exactly to fsync the data we are writing > before the checkpoint is considered done. Good point. Done like this: + /* + * Write dirty CLOG pages to disk. This may result in sync requests queued + * for later handling by ProcessSyncRequests(), as part of the checkpoint. + */ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); - SimpleLruFlush(XactCtl, true); + SimpleLruWriteAll(XactCtl, true); TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); Here's a new version. The final thing I'm contemplating before pushing this is whether there may be hidden magical dependencies in the order of operations in CheckPointGuts(), which I've changed around. Andres, any comments?
From a722d71c4296e5fae778a783eb8140be7887eced Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Fri, 25 Sep 2020 11:16:03 +1200 Subject: [PATCH v8] Defer flushing of SLRU files. Previously, we called fsync() after writing out individual pg_xact, pg_multixact and pg_commit_ts pages due to cache pressure, leading to regular I/O stalls in user backends and recovery. Collapse requests for the same file into a single system call as part of the next checkpoint, as we already did for relation files, using the infrastructure developed by commit 3eb77eba. This causes a significant improvement to recovery performance. Remove the Shutdown{CLOG,CommitTS,SUBTRANS,MultiXact}() functions, because they were redundant after the shutdown checkpoint that immediately precedes them. Hoist ProcessSyncRequests() up into CheckPointGuts() to make it clearer that it applies to all the SLRU mini-buffer-pools as well as the main buffer pool. Also rearrange things so that data collected in CheckpointStats includes SLRU activity. Reviewed-by: Tom Lane <t...@sss.pgh.pa.us> (ShutdownXXX removal part) Tested-by: Jakub Wartak <jakub.war...@tomtom.com> Discussion: https://postgr.es/m/CA+hUKGLJ=84yt+nvhkeedauutvhmfq9i-n7k_o50jmq6rpj...@mail.gmail.com --- src/backend/access/transam/clog.c | 39 +++---- src/backend/access/transam/commit_ts.c | 36 +++--- src/backend/access/transam/multixact.c | 57 +++++---- src/backend/access/transam/slru.c | 154 +++++++++++++++++-------- src/backend/access/transam/subtrans.c | 25 +--- src/backend/access/transam/xlog.c | 18 ++- src/backend/commands/async.c | 5 +- src/backend/storage/buffer/bufmgr.c | 7 -- src/backend/storage/lmgr/predicate.c | 8 +- src/backend/storage/sync/sync.c | 28 ++++- src/include/access/clog.h | 3 + src/include/access/commit_ts.h | 3 + src/include/access/multixact.h | 4 + src/include/access/slru.h | 14 ++- src/include/storage/sync.h | 7 +- src/tools/pgindent/typedefs.list | 1 + 16 files changed, 244 insertions(+), 165 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 9e352d2658..2ec6a924db 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -42,6 +42,7 @@ #include "pg_trace.h" #include "pgstat.h" #include "storage/proc.h" +#include "storage/sync.h" /* * Defines for CLOG page sizes. A page is the same BLCKSZ as is used @@ -691,7 +692,8 @@ CLOGShmemInit(void) { XactCtl->PagePrecedes = CLOGPagePrecedes; SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER); + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); } /* @@ -808,34 +810,18 @@ TrimCLOG(void) LWLockRelease(XactSLRULock); } -/* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownCLOG(void) -{ - /* Flush dirty CLOG pages to disk */ - TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false); - SimpleLruFlush(XactCtl, false); - - /* - * fsync pg_xact to ensure that any files flushed previously are durably - * on disk. - */ - fsync_fname("pg_xact", true); - - TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false); -} - /* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointCLOG(void) { - /* Flush dirty CLOG pages to disk */ + /* + * Write dirty CLOG pages to disk. This may result in sync requests queued + * for later handling by ProcessSyncRequests(), as part of the checkpoint. + */ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); - SimpleLruFlush(XactCtl, true); + SimpleLruWriteAll(XactCtl, true); TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); } @@ -1026,3 +1012,12 @@ clog_redo(XLogReaderState *record) else elog(PANIC, "clog_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index f6a7329ba3..cb8a968801 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -555,7 +555,8 @@ CommitTsShmemInit(void) CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, CommitTsSLRULock, "pg_commit_ts", - LWTRANCHE_COMMITTS_BUFFER); + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); commitTsShared = ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), @@ -798,30 +799,18 @@ DeactivateCommitTs(void) LWLockRelease(CommitTsSLRULock); } -/* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownCommitTs(void) -{ - /* Flush dirty CommitTs pages to disk */ - SimpleLruFlush(CommitTsCtl, false); - - /* - * fsync pg_commit_ts to ensure that any files flushed previously are - * durably on disk. - */ - fsync_fname("pg_commit_ts", true); -} - /* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointCommitTs(void) { - /* Flush dirty CommitTs pages to disk */ - SimpleLruFlush(CommitTsCtl, true); + /* + * Write dirty CommitTs pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(CommitTsCtl, true); } /* @@ -1077,3 +1066,12 @@ commit_ts_redo(XLogReaderState *record) else elog(PANIC, "commit_ts_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b8bedca04a..a2ce617c8c 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1831,11 +1831,13 @@ MultiXactShmemInit(void) SimpleLruInit(MultiXactOffsetCtl, "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, MultiXactOffsetSLRULock, "pg_multixact/offsets", - LWTRANCHE_MULTIXACTOFFSET_BUFFER); + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); SimpleLruInit(MultiXactMemberCtl, "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, MultiXactMemberSLRULock, "pg_multixact/members", - LWTRANCHE_MULTIXACTMEMBER_BUFFER); + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", @@ -2100,19 +2102,6 @@ TrimMultiXact(void) SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); } -/* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownMultiXact(void) -{ - /* Flush dirty MultiXact pages to disk */ - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false); - SimpleLruFlush(MultiXactOffsetCtl, false); - SimpleLruFlush(MultiXactMemberCtl, false); - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false); -} - /* * Get the MultiXact data to save in a checkpoint record */ @@ -2143,9 +2132,13 @@ CheckPointMultiXact(void) { TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); - /* Flush dirty MultiXact pages to disk */ - SimpleLruFlush(MultiXactOffsetCtl, true); - SimpleLruFlush(MultiXactMemberCtl, true); + /* + * Write dirty MultiXact pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); } @@ -2728,14 +2721,10 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) entryno = MultiXactIdToOffsetEntry(multi); /* - * Flush out dirty data, so PhysicalPageExists can work correctly. - * SimpleLruFlush() is a pretty big hammer for that. Alternatively we - * could add an in-memory version of page exists, but find_multixact_start - * is called infrequently, and it doesn't seem bad to flush buffers to - * disk before truncation. + * Write out dirty data, so PhysicalPageExists can work correctly. */ - SimpleLruFlush(MultiXactOffsetCtl, true); - SimpleLruFlush(MultiXactMemberCtl, true); + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) return false; @@ -3386,3 +3375,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funccxt); } + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. + */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe7d759a8c..636b90db87 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -63,22 +63,33 @@ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) /* - * During SimpleLruFlush(), we will usually not need to write/fsync more - * than one or two physical files, but we may need to write several pages - * per file. We can consolidate the I/O requests by leaving files open - * until control returns to SimpleLruFlush(). This data structure remembers - * which files are open. + * During SimpleLruWriteAll(), we will usually not need to write more than one + * or two physical files, but we may need to write several pages per file. We + * can consolidate the I/O requests by leaving files open until control returns + * to SimpleLruWriteAll(). This data structure remembers which files are open. */ -#define MAX_FLUSH_BUFFERS 16 +#define MAX_WRITEALL_BUFFERS 16 -typedef struct SlruFlushData +typedef struct SlruWriteAllData { int num_files; /* # files actually open */ - int fd[MAX_FLUSH_BUFFERS]; /* their FD's */ - int segno[MAX_FLUSH_BUFFERS]; /* their log seg#s */ -} SlruFlushData; + int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */ + int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */ +} SlruWriteAllData; -typedef struct SlruFlushData *SlruFlush; +typedef struct SlruWriteAllData *SlruWriteAll; + +/* + * Populate a file tag describing a segment file. We only use the segment + * number, since we can derive everything else we need by having separate + * sync handler functions for clog, multixact etc. + */ +#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = (xx_handler), \ + (a).segno = (xx_segno) \ +) /* * Macro to mark a buffer slot "most recently used". Note multiple evaluation @@ -125,10 +136,10 @@ static int slru_errno; static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); -static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, - SlruFlush fdata); + SlruWriteAll fdata); static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); static int SlruSelectLRUPage(SlruCtl ctl, int pageno); @@ -173,7 +184,8 @@ SimpleLruShmemSize(int nslots, int nlsns) */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id) + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) { SlruShared shared; bool found; @@ -251,7 +263,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, * assume caller set PagePrecedes. */ ctl->shared = shared; - ctl->do_fsync = true; /* default behavior */ + ctl->sync_handler = sync_handler; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } @@ -523,7 +535,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) * Control lock must be held at entry, and will be held at exit. */ static void -SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) { SlruShared shared = ctl->shared; int pageno = shared->page_number[slotno]; @@ -587,6 +599,10 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) /* Now it's okay to ereport if we failed */ if (!ok) SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* If part of a checkpoint, count this as a buffer written. */ + if (fdata) + CheckpointStats.ckpt_bufs_written++; } /* @@ -730,13 +746,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * * For now, assume it's not worth keeping a file pointer open across * independent read/write operations. We do batch operations during - * SimpleLruFlush, though. + * SimpleLruWriteAll, though. * * fdata is NULL for a standalone write, pointer to open-file info during - * SimpleLruFlush. + * SimpleLruWriteAll. */ static bool -SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) { SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; @@ -791,7 +807,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } /* - * During a Flush, we may already have the desired file open. + * During a WriteAll, we may already have the desired file open. */ if (fdata) { @@ -837,7 +853,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) if (fdata) { - if (fdata->num_files < MAX_FLUSH_BUFFERS) + if (fdata->num_files < MAX_WRITEALL_BUFFERS) { fdata->fd[fdata->num_files] = fd; fdata->segno[fdata->num_files] = segno; @@ -870,23 +886,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } pgstat_report_wait_end(); - /* - * If not part of Flush, need to fsync now. We assume this happens - * infrequently enough that it's not a performance issue. - */ - if (!fdata) + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); - if (ctl->do_fsync && pg_fsync(fd) != 0) + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } pgstat_report_wait_end(); - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; } - pgstat_report_wait_end(); + } + /* Close file, unless part of flush request. */ + if (!fdata) + { if (CloseTransientFile(fd) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1122,13 +1146,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) } /* - * Flush dirty pages to disk during checkpoint or database shutdown + * Write dirty pages to disk during checkpoint or database shutdown. Flushing + * is deferred until the next call to ProcessSyncRequests(), though we do fsync + * the containing directory here to make sure that newly created directory + * entries are on disk. */ void -SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) +SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) { SlruShared shared = ctl->shared; - SlruFlushData fdata; + SlruWriteAllData fdata; int slotno; int pageno = 0; int i; @@ -1162,21 +1189,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) LWLockRelease(shared->ControlLock); /* - * Now fsync and close any files that were open + * Now close any files that were open */ ok = true; for (i = 0; i < fdata.num_files; i++) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); - if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0) - { - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; - ok = false; - } - pgstat_report_wait_end(); - if (CloseTransientFile(fdata.fd[i]) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1189,7 +1206,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) SlruReportIOError(ctl, pageno, InvalidTransactionId); /* Ensure that directory entries for new files are on disk. */ - if (ctl->do_fsync) + if (ctl->sync_handler != SYNC_HANDLER_NONE) fsync_fname(ctl->Dir, true); } @@ -1350,6 +1367,19 @@ restart: snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno); ereport(DEBUG2, (errmsg("removing file \"%s\"", path))); + + /* + * Tell the checkpointer to forget any sync requests, before we unlink the + * file. + */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + unlink(path); LWLockRelease(shared->ControlLock); @@ -1448,3 +1478,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) return retval; } + +/* + * Individual SLRUs (clog, ...) have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. + */ +int +SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + result = pg_fsync(fd); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index a50f60b99a..0111e867c7 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -193,9 +193,7 @@ SUBTRANSShmemInit(void) SubTransCtl->PagePrecedes = SubTransPagePrecedes; SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, SubtransSLRULock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFER); - /* Override default assumption that writes should be fsync'd */ - SubTransCtl->do_fsync = false; + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); } /* @@ -278,23 +276,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID) LWLockRelease(SubtransSLRULock); } -/* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownSUBTRANS(void) -{ - /* - * Flush dirty SUBTRANS pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely as a debugging aid. - */ - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false); - SimpleLruFlush(SubTransCtl, false); - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false); -} - /* * Perform a checkpoint --- either during shutdown, or on-the-fly */ @@ -302,14 +283,14 @@ void CheckPointSUBTRANS(void) { /* - * Flush dirty SUBTRANS pages to disk + * Write dirty SUBTRANS pages to disk * * This is not actually necessary from a correctness point of view. We do * it merely to improve the odds that writing of dirty pages is done by * the checkpoint process and not by backends. */ TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); - SimpleLruFlush(SubTransCtl, true); + SimpleLruWriteAll(SubTransCtl, true); TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 61754312e2..ed17dd0d12 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8528,10 +8528,6 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } - ShutdownCLOG(); - ShutdownCommitTs(); - ShutdownSUBTRANS(); - ShutdownMultiXact(); } /* @@ -9176,16 +9172,28 @@ CreateEndOfRecoveryRecord(void) static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { + /* Write out all dirty data in SLRUs and the main buffer pool */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); + CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); CheckPointCLOG(); CheckPointCommitTs(); CheckPointSUBTRANS(); CheckPointMultiXact(); CheckPointPredicate(); + CheckPointBuffers(flags); + + /* Perform all required fsyncs */ + CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); + ProcessSyncRequests(); + CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); + + /* Other activities performed at checkpoint time */ CheckPointRelationMap(); CheckPointReplicationSlots(); CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); - CheckPointBuffers(flags); /* performs all required fsyncs */ CheckPointReplicationOrigin(); /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index cb341365df..8dbcace3f9 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -554,9 +554,8 @@ AsyncShmemInit(void) */ NotifyCtl->PagePrecedes = asyncQueuePagePrecedes; SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0, - NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER); - /* Override default assumption that writes should be fsync'd */ - NotifyCtl->do_fsync = false; + NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER, + SYNC_HANDLER_NONE); if (!found) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a2a963bd5b..e549fa1d30 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2636,14 +2636,7 @@ PrintBufferLeakWarning(Buffer buffer) void CheckPointBuffers(int flags) { - TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); - CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); BufferSync(flags); - CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); - TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); - ProcessSyncRequests(); - CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); - TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); } diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index a2f8e7524b..8a365b400c 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -821,9 +821,7 @@ SerialInit(void) SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; SimpleLruInit(SerialSlruCtl, "Serial", NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", - LWTRANCHE_SERIAL_BUFFER); - /* Override default assumption that writes should be fsync'd */ - SerialSlruCtl->do_fsync = false; + LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); /* * Create or attach to the SerialControl structure. @@ -1052,7 +1050,7 @@ CheckPointPredicate(void) SimpleLruTruncate(SerialSlruCtl, tailPage); /* - * Flush dirty SLRU pages to disk + * Write dirty SLRU pages to disk * * This is not actually necessary from a correctness point of view. We do * it merely as a debugging aid. @@ -1061,7 +1059,7 @@ CheckPointPredicate(void) * before deleting the file in which they sit, which would be completely * pointless. */ - SimpleLruFlush(SerialSlruCtl, true); + SimpleLruWriteAll(SerialSlruCtl, true); } /*------------------------------------------------------------------------*/ diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 3ded2cdd71..1d635d596c 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -18,6 +18,9 @@ #include <fcntl.h> #include <sys/file.h> +#include "access/commit_ts.h" +#include "access/clog.h" +#include "access/multixact.h" #include "access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -90,12 +93,31 @@ typedef struct SyncOps const FileTag *candidate); } SyncOps; +/* + * These indexes must correspond to the values of the SyncRequestHandler enum. + */ static const SyncOps syncsw[] = { /* magnetic disk */ - { + [SYNC_HANDLER_MD] = { .sync_syncfiletag = mdsyncfiletag, .sync_unlinkfiletag = mdunlinkfiletag, .sync_filetagmatches = mdfiletagmatches + }, + /* pg_xact */ + [SYNC_HANDLER_CLOG] = { + .sync_syncfiletag = clogsyncfiletag + }, + /* pg_commit_ts */ + [SYNC_HANDLER_COMMIT_TS] = { + .sync_syncfiletag = committssyncfiletag + }, + /* pg_multixact/offsets */ + [SYNC_HANDLER_MULTIXACT_OFFSET] = { + .sync_syncfiletag = multixactoffsetssyncfiletag + }, + /* pg_multixact/members */ + [SYNC_HANDLER_MULTIXACT_MEMBER] = { + .sync_syncfiletag = multixactmemberssyncfiletag } }; @@ -505,8 +527,8 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type) (void *) ftag, HASH_ENTER, &found); - /* if new entry, initialize it */ - if (!found) + /* if new entry, or was previously canceled, initialize it */ + if (!found || entry->canceled) { entry->cycle_ctr = sync_cycle_ctr; entry->canceled = false; diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 2db8acb189..d97b9042dc 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -12,6 +12,7 @@ #define CLOG_H #include "access/xlogreader.h" +#include "storage/sync.h" #include "lib/stringinfo.h" /* @@ -50,6 +51,8 @@ extern void CheckPointCLOG(void); extern void ExtendCLOG(TransactionId newestXact); extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); +extern int clogsyncfiletag(const FileTag *ftag, char *path); + /* XLOG stuff */ #define CLOG_ZEROPAGE 0x00 #define CLOG_TRUNCATE 0x10 diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h index 2740c02a84..27900ce430 100644 --- a/src/include/access/commit_ts.h +++ b/src/include/access/commit_ts.h @@ -14,6 +14,7 @@ #include "access/xlog.h" #include "datatype/timestamp.h" #include "replication/origin.h" +#include "storage/sync.h" #include "utils/guc.h" @@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact); extern void AdvanceOldestCommitTsXid(TransactionId oldestXact); +extern int committssyncfiletag(const FileTag *ftag, char *path); + /* XLOG stuff */ #define COMMIT_TS_ZEROPAGE 0x00 #define COMMIT_TS_TRUNCATE 0x10 diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 6d729008c6..71d6e78063 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -13,6 +13,7 @@ #include "access/xlogreader.h" #include "lib/stringinfo.h" +#include "storage/sync.h" /* @@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2); +extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path); +extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path); + extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); extern void PostPrepare_MultiXact(TransactionId xid); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 61fbc80ef0..b39b43504d 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -15,6 +15,7 @@ #include "access/xlogdefs.h" #include "storage/lwlock.h" +#include "storage/sync.h" /* @@ -111,10 +112,10 @@ typedef struct SlruCtlData SlruShared shared; /* - * This flag tells whether to fsync writes (true for pg_xact and multixact - * stuff, false for pg_subtrans and pg_notify). + * Which sync handler function to use when handing sync requests over to + * the checkpointer. SYNC_HANDLER_NONE to disable fsync (eg pg_notify). */ - bool do_fsync; + SyncRequestHandler sync_handler; /* * Decide which of two page numbers is "older" for truncation purposes. We @@ -135,14 +136,15 @@ typedef SlruCtlData *SlruCtl; extern Size SimpleLruShmemSize(int nslots, int nlsns); extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id); + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler); extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); -extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); +extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); @@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern void SlruDeleteSegment(SlruCtl ctl, int segno); +extern int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path); + /* SlruScanDirectory public callbacks */ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data); diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index e16ab8e711..f32e412e75 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -34,7 +34,12 @@ typedef enum SyncRequestType */ typedef enum SyncRequestHandler { - SYNC_HANDLER_MD = 0 /* md smgr */ + SYNC_HANDLER_MD = 0, + SYNC_HANDLER_CLOG, + SYNC_HANDLER_COMMIT_TS, + SYNC_HANDLER_MULTIXACT_OFFSET, + SYNC_HANDLER_MULTIXACT_MEMBER, + SYNC_HANDLER_NONE } SyncRequestHandler; /* diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b1afb345c3..88fc3ceae4 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2404,6 +2404,7 @@ Syn SyncOps SyncRepConfigData SyncRepStandbyData +SyncRequestHandler SyncRequestType SysScanDesc SyscacheCallbackFunction -- 2.20.1