On Tue, Feb 02, 2021 at 07:14:16AM -0800, Noah Misch wrote: > Recycling and preallocation are wasteful during archive recovery, because > KeepFileRestoredFromArchive() unlinks every entry in its path. I propose to > fix the race by adding an XLogCtl flag indicating which regime currently owns > the right to add long-term pg_wal directory entries. In the archive recovery > regime, the checkpointer will not preallocate and will unlink old segments > instead of recycling them (like wal_recycle=off). XLogFileInit() will fail.
Here's the implementation. Patches 1-4 suffice to stop the user-visible ERROR. Patch 5 avoids a spurious LOG-level message and wasted filesystem writes, and it provides some future-proofing. I was tempted to (but did not) just remove preallocation. Creating one file per checkpoint seems tiny relative to the max_wal_size=1GB default, so I expect it's hard to isolate any benefit. Under the old checkpoint_segments=3 default, a preallocated segment covered a respectable third of the next checkpoint. Before commit 63653f7 (2002), preallocation created more files.
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> Remove XLogFileInit() ability to skip ControlFileLock. Cold paths, initdb and end-of-recovery, used it. Don't optimize them. Discussion: https://postgr.es/m/20210202151416.gb3304...@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1b3a3d9..39a38ba 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -913,8 +913,7 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); static bool XLogCheckpointNeeded(XLogSegNo new_segno); static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible); static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, - bool find_free, XLogSegNo max_segno, - bool use_lock); + bool find_free, XLogSegNo max_segno); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk); static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); @@ -2492,7 +2491,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) /* create/use new log file */ use_existent = true; - openLogFile = XLogFileInit(openLogSegNo, &use_existent, true); + openLogFile = XLogFileInit(openLogSegNo, &use_existent); ReserveExternalFD(); } @@ -3265,10 +3264,6 @@ XLogNeedsFlush(XLogRecPtr record) * pre-existing file will be deleted). On return, true if a pre-existing * file was used. * - * use_lock: if true, acquire ControlFileLock while moving file into - * place. This should be true except during bootstrap log creation. The - * caller must *not* hold the lock at call. - * * Returns FD of opened file. * * Note: errors here are ERROR not PANIC because we might or might not be @@ -3277,7 +3272,7 @@ XLogNeedsFlush(XLogRecPtr record) * in a critical section. */ int -XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +XLogFileInit(XLogSegNo logsegno, bool *use_existent) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; @@ -3437,8 +3432,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ max_segno = logsegno + CheckPointSegments; if (!InstallXLogFileSegment(&installed_segno, tmppath, - *use_existent, max_segno, - use_lock)) + *use_existent, max_segno)) { /* * No need for any more future segments, or InstallXLogFileSegment() @@ -3592,7 +3586,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, /* * Now move the segment into place with its final name. */ - if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false)) + if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0)) elog(ERROR, "InstallXLogFileSegment should not have failed"); } @@ -3616,29 +3610,20 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, * free slot is found between *segno and max_segno. (Ignored when find_free * is false.) * - * use_lock: if true, acquire ControlFileLock while moving file into - * place. This should be true except during bootstrap log creation. The - * caller must *not* hold the lock at call. - * * Returns true if the file was installed successfully. false indicates that * max_segno limit was exceeded, or an error occurred while renaming the * file into place. */ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, - bool find_free, XLogSegNo max_segno, - bool use_lock) + bool find_free, XLogSegNo max_segno) { char path[MAXPGPATH]; struct stat stat_buf; XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size); - /* - * We want to be sure that only one process does this at a time. - */ - if (use_lock) - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); if (!find_free) { @@ -3653,8 +3638,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ - if (use_lock) - LWLockRelease(ControlFileLock); + LWLockRelease(ControlFileLock); return false; } (*segno)++; @@ -3668,14 +3652,12 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, */ if (durable_rename_excl(tmppath, path, LOG) != 0) { - if (use_lock) - LWLockRelease(ControlFileLock); + LWLockRelease(ControlFileLock); /* durable_rename_excl already emitted log message */ return false; } - if (use_lock) - LWLockRelease(ControlFileLock); + LWLockRelease(ControlFileLock); return true; } @@ -3946,7 +3928,7 @@ PreallocXlogFiles(XLogRecPtr endptr) { _logSegNo++; use_existent = true; - lf = XLogFileInit(_logSegNo, &use_existent, true); + lf = XLogFileInit(_logSegNo, &use_existent); close(lf); if (!use_existent) CheckpointStats.ckpt_segs_added++; @@ -4223,7 +4205,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, *endlogSegNo <= recycleSegNo && lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && InstallXLogFileSegment(endlogSegNo, path, - true, recycleSegNo, true)) + true, recycleSegNo)) { ereport(DEBUG2, (errmsg_internal("recycled write-ahead log file \"%s\"", @@ -5341,7 +5323,7 @@ BootStrapXLOG(void) /* Create first XLOG segment file */ use_existent = false; - openLogFile = XLogFileInit(1, &use_existent, false); + openLogFile = XLogFileInit(1, &use_existent); /* * We needn't bother with Reserve/ReleaseExternalFD here, since we'll @@ -5650,7 +5632,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) bool use_existent = true; int fd; - fd = XLogFileInit(startLogSegNo, &use_existent, true); + fd = XLogFileInit(startLogSegNo, &use_existent); if (close(fd) != 0) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index faeea9f..b00066e 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -924,7 +924,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) /* Create/use new log file */ XLByteToSeg(recptr, recvSegNo, wal_segment_size); use_existent = true; - recvFile = XLogFileInit(recvSegNo, &use_existent, true); + recvFile = XLogFileInit(recvSegNo, &use_existent); recvFileTLI = ThisTimeLineID; } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 77187c1..a175649 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -296,7 +296,7 @@ extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); -extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); +extern int XLogFileInit(XLogSegNo segno, bool *use_existent); extern int XLogFileOpen(XLogSegNo segno); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> In XLogFileInit(), fix *use_existent postcondition to suit callers. Infrequently, the mismatch caused log_checkpoints messages and TRACE_POSTGRESQL_CHECKPOINT_DONE() to witness an "added" count too high by one. Since that consequence is so minor, no back-patch. Discussion: https://postgr.es/m/20210202151416.gb3304...@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 39a38ba..073dabc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3261,8 +3261,8 @@ XLogNeedsFlush(XLogRecPtr record) * logsegno: identify segment to be created/opened. * * *use_existent: if true, OK to use a pre-existing file (else, any - * pre-existing file will be deleted). On return, true if a pre-existing - * file was used. + * pre-existing file will be deleted). On return, false iff this call added + * some segment on disk. * * Returns FD of opened file. * @@ -3431,8 +3431,10 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent) * CheckPointSegments. */ max_segno = logsegno + CheckPointSegments; - if (!InstallXLogFileSegment(&installed_segno, tmppath, - *use_existent, max_segno)) + if (InstallXLogFileSegment(&installed_segno, tmppath, + *use_existent, max_segno)) + *use_existent = false; + else { /* * No need for any more future segments, or InstallXLogFileSegment() @@ -3442,9 +3444,6 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent) unlink(tmppath); } - /* Set flag to tell caller there was no existent file */ - *use_existent = false; - /* Now open original target segment (might not be file I just made) */ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); if (fd < 0)
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> Remove XLogFileInit() ability to unlink a pre-existing file. Only initdb used it. initdb refuses to operate on a non-empty directory and generally does not cope with pre-existing files of other kinds. Hence, use the opportunity to simplify. Discussion: https://postgr.es/m/20210202151416.gb3304...@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 073dabc..1a31788 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2424,7 +2424,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) bool ispartialpage; bool last_iteration; bool finishing_seg; - bool use_existent; + bool added; int curridx; int npages; int startidx; @@ -2490,8 +2490,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) wal_segment_size); /* create/use new log file */ - use_existent = true; - openLogFile = XLogFileInit(openLogSegNo, &use_existent); + openLogFile = XLogFileInit(openLogSegNo, &added); ReserveExternalFD(); } @@ -3260,9 +3259,7 @@ XLogNeedsFlush(XLogRecPtr record) * * logsegno: identify segment to be created/opened. * - * *use_existent: if true, OK to use a pre-existing file (else, any - * pre-existing file will be deleted). On return, false iff this call added - * some segment on disk. + * *added: on return, true if this call raised the number of extant segments. * * Returns FD of opened file. * @@ -3272,7 +3269,7 @@ XLogNeedsFlush(XLogRecPtr record) * in a critical section. */ int -XLogFileInit(XLogSegNo logsegno, bool *use_existent) +XLogFileInit(XLogSegNo logsegno, bool *added) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; @@ -3287,19 +3284,17 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent) /* * Try to use existent file (checkpoint maker may have created it already) */ - if (*use_existent) + *added = false; + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) { - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); - if (fd < 0) - { - if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); - } - else - return fd; + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); } + else + return fd; /* * Initialize an empty (all zeroes) segment. NOTE: it is possible that @@ -3412,12 +3407,9 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent) errmsg("could not close file \"%s\": %m", tmppath))); /* - * Now move the segment into place with its final name. - * - * If caller didn't want to use a pre-existing file, get rid of any - * pre-existing file. Otherwise, cope with possibility that someone else - * has created the file while we were filling ours: if so, use ours to - * pre-create a future log segment. + * Now move the segment into place with its final name. Cope with + * possibility that someone else has created the file while we were + * filling ours: if so, use ours to pre-create a future log segment. */ installed_segno = logsegno; @@ -3431,9 +3423,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent) * CheckPointSegments. */ max_segno = logsegno + CheckPointSegments; - if (InstallXLogFileSegment(&installed_segno, tmppath, - *use_existent, max_segno)) - *use_existent = false; + if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno)) + *added = true; else { /* @@ -3918,7 +3909,7 @@ PreallocXlogFiles(XLogRecPtr endptr) { XLogSegNo _logSegNo; int lf; - bool use_existent; + bool added; uint64 offset; XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); @@ -3926,10 +3917,9 @@ PreallocXlogFiles(XLogRecPtr endptr) if (offset >= (uint32) (0.75 * wal_segment_size)) { _logSegNo++; - use_existent = true; - lf = XLogFileInit(_logSegNo, &use_existent); + lf = XLogFileInit(_logSegNo, &added); close(lf); - if (!use_existent) + if (added) CheckpointStats.ckpt_segs_added++; } } @@ -5224,7 +5214,7 @@ BootStrapXLOG(void) XLogLongPageHeader longpage; XLogRecord *record; char *recptr; - bool use_existent; + bool added; uint64 sysidentifier; struct timeval tv; pg_crc32c crc; @@ -5321,8 +5311,7 @@ BootStrapXLOG(void) record->xl_crc = crc; /* Create first XLOG segment file */ - use_existent = false; - openLogFile = XLogFileInit(1, &use_existent); + openLogFile = XLogFileInit(1, &added); /* * We needn't bother with Reserve/ReleaseExternalFD here, since we'll @@ -5628,10 +5617,10 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * The switch happened at a segment boundary, so just create the next * segment on the new timeline. */ - bool use_existent = true; + bool added; int fd; - fd = XLogFileInit(startLogSegNo, &use_existent); + fd = XLogFileInit(startLogSegNo, &added); if (close(fd) != 0) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index b00066e..eadff8f 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -885,7 +885,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (recvFile < 0 || !XLByteInSeg(recptr, recvSegNo, wal_segment_size)) { - bool use_existent; + bool added; /* * fsync() and close current file before we switch to next one. We @@ -923,8 +923,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) /* Create/use new log file */ XLByteToSeg(recptr, recvSegNo, wal_segment_size); - use_existent = true; - recvFile = XLogFileInit(recvSegNo, &use_existent); + recvFile = XLogFileInit(recvSegNo, &added); recvFileTLI = ThisTimeLineID; } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a175649..d8b8f59 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -296,7 +296,7 @@ extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); -extern int XLogFileInit(XLogSegNo segno, bool *use_existent); +extern int XLogFileInit(XLogSegNo segno, bool *added); extern int XLogFileOpen(XLogSegNo segno); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> Don't ERROR on PreallocXlogFiles() race condition. Before a restartpoint finishes PreallocXlogFiles(), a startup process KeepFileRestoredFromArchive() call can unlink the preallocated segment. If a CHECKPOINT sql command had elicited the restartpoint experiencing the race condition, that sql command failed. Moreover, the restartpoint omitted its log_checkpoints message and some inessential resource reclamation. Prevent the ERROR by skipping open() of the segment. Since these consequences are so minor, no back-patch. Discussion: https://postgr.es/m/20210202151416.gb3304...@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1a31788..8cda20d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2424,7 +2424,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) bool ispartialpage; bool last_iteration; bool finishing_seg; - bool added; int curridx; int npages; int startidx; @@ -2490,7 +2489,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) wal_segment_size); /* create/use new log file */ - openLogFile = XLogFileInit(openLogSegNo, &added); + openLogFile = XLogFileInit(openLogSegNo); ReserveExternalFD(); } @@ -3255,23 +3254,21 @@ XLogNeedsFlush(XLogRecPtr record) } /* - * Create a new XLOG file segment, or open a pre-existing one. + * Try to make a given XLOG file segment exist. * - * logsegno: identify segment to be created/opened. + * logsegno: identify segment. * * *added: on return, true if this call raised the number of extant segments. * - * Returns FD of opened file. + * path: on return, this char[MAXPGPATH] has the path to the logsegno file. * - * Note: errors here are ERROR not PANIC because we might or might not be - * inside a critical section (eg, during checkpoint there is no reason to - * take down the system on failure). They will promote to PANIC if we are - * in a critical section. + * Returns -1 or FD of opened file. A -1 here is not an error; a caller + * wanting an open segment should attempt to open "path", which usually will + * succeed. (This is weird, but it's efficient for the callers.) */ -int -XLogFileInit(XLogSegNo logsegno, bool *added) +static int +XLogFileInitInternal(XLogSegNo logsegno, bool *added, char *path) { - char path[MAXPGPATH]; char tmppath[MAXPGPATH]; PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; @@ -3424,26 +3421,53 @@ XLogFileInit(XLogSegNo logsegno, bool *added) */ max_segno = logsegno + CheckPointSegments; if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno)) + { *added = true; + elog(DEBUG2, "done creating and filling new WAL file"); + } else { /* * No need for any more future segments, or InstallXLogFileSegment() - * failed to rename the file into place. If the rename failed, opening - * the file below will fail. + * failed to rename the file into place. If the rename failed, a + * caller opening the file may fail. */ unlink(tmppath); + elog(DEBUG2, "abandoned new WAL file"); } + return -1; +} + +/* + * Create a new XLOG file segment, or open a pre-existing one. + * + * logsegno: identify segment to be created/opened. + * + * Returns FD of opened file. + * + * Note: errors here are ERROR not PANIC because we might or might not be + * inside a critical section (eg, during checkpoint there is no reason to + * take down the system on failure). They will promote to PANIC if we are + * in a critical section. + */ +int +XLogFileInit(XLogSegNo logsegno) +{ + bool ignore_added; + char path[MAXPGPATH]; + int fd; + + fd = XLogFileInitInternal(logsegno, &ignore_added, path); + if (fd >= 0) + return fd; + /* Now open original target segment (might not be file I just made) */ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); - - elog(DEBUG2, "done creating and filling new WAL file"); - return fd; } @@ -3903,6 +3927,15 @@ XLogFileClose(void) * High-volume systems will be OK once they've built up a sufficient set of * recycled log segments, but the startup transient is likely to include * a lot of segment creations by foreground processes, which is not so good. + * + * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big + * trouble; for example, a full filesystem is one cause. The checkpoint WAL + * and/or ControlFile updates already completed. If a RequestCheckpoint() + * initiated the present checkpoint and an ERROR ends this function, the + * command that called RequestCheckpoint() fails. That's not ideal, but it's + * not worth contorting more functions to use caller-specified elevel values. + * (With or without RequestCheckpoint(), an ERROR forestalls some inessential + * reporting and resource reclamation.) */ static void PreallocXlogFiles(XLogRecPtr endptr) @@ -3910,6 +3943,7 @@ PreallocXlogFiles(XLogRecPtr endptr) XLogSegNo _logSegNo; int lf; bool added; + char path[MAXPGPATH]; uint64 offset; XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); @@ -3917,8 +3951,9 @@ PreallocXlogFiles(XLogRecPtr endptr) if (offset >= (uint32) (0.75 * wal_segment_size)) { _logSegNo++; - lf = XLogFileInit(_logSegNo, &added); - close(lf); + lf = XLogFileInitInternal(_logSegNo, &added, path); + if (lf >= 0) + close(lf); if (added) CheckpointStats.ckpt_segs_added++; } @@ -5214,7 +5249,6 @@ BootStrapXLOG(void) XLogLongPageHeader longpage; XLogRecord *record; char *recptr; - bool added; uint64 sysidentifier; struct timeval tv; pg_crc32c crc; @@ -5311,7 +5345,7 @@ BootStrapXLOG(void) record->xl_crc = crc; /* Create first XLOG segment file */ - openLogFile = XLogFileInit(1, &added); + openLogFile = XLogFileInit(1); /* * We needn't bother with Reserve/ReleaseExternalFD here, since we'll @@ -5617,10 +5651,9 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * The switch happened at a segment boundary, so just create the next * segment on the new timeline. */ - bool added; int fd; - fd = XLogFileInit(startLogSegNo, &added); + fd = XLogFileInit(startLogSegNo); if (close(fd) != 0) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index eadff8f..2be9ad9 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -885,8 +885,6 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) if (recvFile < 0 || !XLByteInSeg(recptr, recvSegNo, wal_segment_size)) { - bool added; - /* * fsync() and close current file before we switch to next one. We * would otherwise have to reopen this file to fsync it later @@ -923,7 +921,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) /* Create/use new log file */ XLByteToSeg(recptr, recvSegNo, wal_segment_size); - recvFile = XLogFileInit(recvSegNo, &added); + recvFile = XLogFileInit(recvSegNo); recvFileTLI = ThisTimeLineID; } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d8b8f59..7510e88 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -296,7 +296,7 @@ extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); -extern int XLogFileInit(XLogSegNo segno, bool *added); +extern int XLogFileInit(XLogSegNo segno); extern int XLogFileOpen(XLogSegNo segno); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
Author: Noah Misch <n...@leadboat.com> Commit: Noah Misch <n...@leadboat.com> Skip WAL recycling and preallocation during archive recovery. The previous commit addressed the chief consequences of a race condition between InstallXLogFileSegment() and KeepFileRestoredFromArchive(). Fix three lesser consequences. A spurious durable_rename_excl() LOG message remained possible. KeepFileRestoredFromArchive() wasted the proceeds of WAL recycling and preallocation. Finally, XLogFileInitInternal() could return a descriptor for a file that KeepFileRestoredFromArchive() had already unlinked. That felt like a recipe for future bugs. Discussion: https://postgr.es/m/20210202151416.gb3304...@rfd.leadboat.com diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8cda20d..2c6e21b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -663,6 +663,16 @@ typedef struct XLogCtlData bool SharedHotStandbyActive; /* + * InstallXLogFileSegmentActive indicates whether the checkpointer should + * arrange for future segments by recycling and/or PreallocXlogFiles(). + * Protected by ControlFileLock. Only the startup process changes it. If + * true, anyone can use InstallXLogFileSegment(). If false, the startup + * process owns the exclusive right to install segments, by reading from + * the archive and possibly replacing existing files. + */ + bool InstallXLogFileSegmentActive; + + /* * SharedPromoteIsTriggered indicates if a standby promotion has been * triggered. Protected by info_lck. */ @@ -921,6 +931,7 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr); +static void XLogShutdownWalRcv(void); static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); @@ -3625,8 +3636,8 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, * is false.) * * Returns true if the file was installed successfully. false indicates that - * max_segno limit was exceeded, or an error occurred while renaming the - * file into place. + * max_segno limit was exceeded, the startup process has disabled this + * function for now, or an error occurred while renaming the file into place. */ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, @@ -3638,6 +3649,11 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size); LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } if (!find_free) { @@ -3745,6 +3761,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, */ if (source == XLOG_FROM_ARCHIVE) { + Assert(!XLogCtl->InstallXLogFileSegmentActive); KeepFileRestoredFromArchive(path, xlogfname); /* @@ -3946,6 +3963,9 @@ PreallocXlogFiles(XLogRecPtr endptr) char path[MAXPGPATH]; uint64 offset; + if (!XLogCtl->InstallXLogFileSegmentActive) + return; /* unlocked check says no */ + XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); offset = XLogSegmentOffset(endptr - 1, wal_segment_size); if (offset >= (uint32) (0.75 * wal_segment_size)) @@ -4227,6 +4247,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, */ if (wal_recycle && *endlogSegNo <= recycleSegNo && + XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */ lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && InstallXLogFileSegment(endlogSegNo, path, true, recycleSegNo)) @@ -4240,7 +4261,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, } else { - /* No need for any more future segments... */ + /* No need for any more future segments, or recycling failed ... */ int rc; ereport(DEBUG2, @@ -5226,6 +5247,7 @@ XLOGShmemInit(void) XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; XLogCtl->SharedHotStandbyActive = false; + XLogCtl->InstallXLogFileSegmentActive = false; XLogCtl->SharedPromoteIsTriggered = false; XLogCtl->WalWriterSleeping = false; @@ -5253,6 +5275,11 @@ BootStrapXLOG(void) struct timeval tv; pg_crc32c crc; + /* allow ordinary WAL segment creation, like StartupXLOG() would */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + /* * Select a hopefully-unique system identifier code for this installation. * We use the result of gettimeofday(), including the fractional seconds @@ -7619,7 +7646,7 @@ StartupXLOG(void) * the startup checkpoint record. It will trump over the checkpoint and * subsequent records if it's still alive when we start writing WAL. */ - ShutdownWalRcv(); + XLogShutdownWalRcv(); /* * Reset unlogged relations to the contents of their INIT fork. This is @@ -7644,7 +7671,7 @@ StartupXLOG(void) * recovery, e.g., timeline history file) from archive or pg_wal. * * Note that standby mode must be turned off after killing WAL receiver, - * i.e., calling ShutdownWalRcv(). + * i.e., calling XLogShutdownWalRcv(). */ Assert(!WalRcvStreaming()); StandbyMode = false; @@ -7710,6 +7737,14 @@ StartupXLOG(void) oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); /* + * Allow ordinary WAL segment creation before any exitArchiveRecovery(), + * which sometimes creates a segment, and after the last ReadRecord(). + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + + /* * Consider whether we need to assign a new timeline ID. * * If we are doing an archive recovery, we always assign a new ID. This @@ -12378,7 +12413,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, */ if (StandbyMode && CheckForStandbyTrigger()) { - ShutdownWalRcv(); + XLogShutdownWalRcv(); return false; } @@ -12426,7 +12461,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, * WAL that we restore from archive. */ if (WalRcvStreaming()) - ShutdownWalRcv(); + XLogShutdownWalRcv(); /* * Before we sleep, re-scan for possible new timelines if @@ -12553,7 +12588,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, */ if (pendingWalRcvRestart && !startWalReceiver) { - ShutdownWalRcv(); + XLogShutdownWalRcv(); /* * Re-scan for possible new timelines if we were @@ -12603,6 +12638,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, tli, curFileTLI); } curFileTLI = tli; + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); RequestXLogStreaming(tli, ptr, PrimaryConnInfo, PrimarySlotName, wal_receiver_create_temp_slot); @@ -12770,6 +12808,17 @@ StartupRequestWalReceiverRestart(void) } } +/* Thin wrapper around ShutdownWalRcv(). */ +static void +XLogShutdownWalRcv(void) +{ + ShutdownWalRcv(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + /* * Determine what log level should be used to report a corrupt WAL record * in the current WAL page, previously read by XLogPageRead().