On Mon, Apr 4, 2016 at 3:22 PM, Michael Paquier <michael.paqu...@gmail.com> wrote: > The approach introducing the concept of WAL progress addresses the > problem of unnecessary checkpoints and of unnecessary standby > snapshots. If we take the problem only from the angle of RUNNING_XACTS > the current logic used to check if a checkpoint should be skipped or > not is not addressed.
This discussion has been stalling for a while regarding the main patches... Attached are rebased versions based on the approach with XLogInsertExtended(). And I have added as well a couple of days ago an entry on the 9.6 open item page, not as a 9.6 open item, but as an older bug to keep at least track of this problem. -- Michael
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d60c123..5ae6f49 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8357,8 +8357,12 @@ CreateCheckPoint(int flags) if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) { + elog(LOG, "Not a forced or shutdown checkpoint: progress_lsn %X/%X, ckpt %X/%X", + (uint32) (progress_lsn >> 32), (uint32) progress_lsn, + (uint32) (ControlFile->checkPoint >> 32), (uint32) ControlFile->checkPoint); if (progress_lsn == ControlFile->checkPoint) { + elog(LOG, "Checkpoint is skipped"); WALInsertLockRelease(); LWLockRelease(CheckpointLock); END_CRIT_SECTION(); @@ -8525,7 +8529,11 @@ CreateCheckPoint(int flags) * recovery we don't need to write running xact data. */ if (!shutdown && XLogStandbyInfoActive()) - LogStandbySnapshot(); + { + XLogRecPtr lsn = LogStandbySnapshot(); + elog(LOG, "snapshot taken by checkpoint %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); + } START_CRIT_SECTION(); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 79cfd7b..082e589 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -333,7 +333,9 @@ BackgroundWriterMain(void) GetLastCheckpointRecPtr() < current_progress_lsn && last_progress_lsn < current_progress_lsn) { - (void) LogStandbySnapshot(); + XLogRecPtr lsn = LogStandbySnapshot(); + elog(LOG, "snapshot taken by bgwriter %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); last_snapshot_ts = now; last_progress_lsn = current_progress_lsn; }
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f9644db..d60c123 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -439,11 +439,30 @@ typedef struct XLogwrtResult * the WAL record is just copied to the page and the lock is released. But * to avoid the deadlock-scenario explained above, the indicator is always * updated before sleeping while holding an insertion lock. + * + * The progressAt values indicate the insertion progress used to determine + * WAL insertion activity since a previous checkpoint, which is aimed at + * finding out if a checkpoint should be skipped or not or if standby + * activity should be logged. Progress position is basically updated + * for all types of records, for the time being only snapshot logging + * is out of this scope to properly skip their logging on idle systems. + * Tracking the WAL activity directly in WALInsertLock has the advantage + * to not rely on taking an exclusive lock on all the WAL insertion locks, + * hence reducing the impact of the activity lookup. This takes also + * advantage to avoid 8-byte torn reads on some platforms by using the + * fact that each insert lock is located on the same cache line. + * XXX: There is still room for more improvements here, particularly + * WAL operations related to unlogged relations (INIT_FORKNUM) should not + * update the progress LSN as those relations are reset during crash + * recovery so enforcing buffers of such relations to be flushed for + * example in the case of a load only on unlogged relations is a waste + * of disk write. */ typedef struct { LWLock lock; XLogRecPtr insertingAt; + XLogRecPtr progressAt; } WALInsertLock; /* @@ -881,6 +900,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * which pages need a full-page image, and retry. If fpw_lsn is invalid, the * record is always inserted. * + * 'flags' gives more in-depth control on the record being inserted. As of + * now, this controls if the progress LSN positions are updated. + * * The first XLogRecData in the chain must be for the record header, and its * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and * xl_crc fields in the header, the rest of the header must already be filled @@ -893,7 +915,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * WAL rule "write the log before the data".) */ XLogRecPtr -XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags) { XLogCtlInsert *Insert = &XLogCtl->Insert; pg_crc32c rdata_crc; @@ -992,6 +1016,25 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) inserted = true; } + /* + * Update the progress LSN positions. At least one WAL insertion lock + * is already taken appropriately before doing that, and it is just more + * simple to do that here where WAL record data and type is at hand. + * The progress is set at the start position of the record tracked that + * is being added, making easier checkpoint progress tracking as the + * control file already saves the start LSN position of the last + * checkpoint run. If an exclusive lock is taken for WAL insertion, + * there is actually no need to update all the progression fields, so + * just do it on the first one. + */ + if ((flags & XLOG_INSERT_NO_PROGRESS) == 0) + { + if (holdingAllLocks) + WALInsertLocks[0].l.progressAt = StartPos; + else + WALInsertLocks[MyLockNo].l.progressAt = StartPos; + } + if (inserted) { /* @@ -4717,6 +4760,7 @@ XLOGShmemInit(void) { LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.progressAt = InvalidXLogRecPtr; } /* @@ -7901,6 +7945,55 @@ GetFlushRecPtr(void) } /* + * GetProgressRecPtr -- Returns the newest WAL activity position, aimed + * at the last significant WAL activity, or in other words any activity + * not referring to standby logging as of now. Finding the last activity + * position is done by scanning each WAL insertion lock by taking directly + * the light-weight lock associated to it. + */ +XLogRecPtr +GetProgressRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + /* + * Look at the latest LSN position referring to the activity done by + * WAL insertion. An exclusive lock is taken because currently the + * locking logic for WAL insertion only expects such a level of locking. + * Taking a lock is as well necessary to prevent potential torn reads + * on some platforms. + */ + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr progress_lsn; + + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + progress_lsn = WALInsertLocks[i].l.progressAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < progress_lsn) + res = progress_lsn; + } + + return res; +} + +/* + * GetLastCheckpointRecPtr -- Returns the last checkpoint insert position. + */ +XLogRecPtr +GetLastCheckpointRecPtr(void) +{ + XLogRecPtr ckpt_lsn; + + LWLockAcquire(ControlFileLock, LW_SHARED); + ckpt_lsn = ControlFile->checkPoint; + LWLockRelease(ControlFileLock); + return ckpt_lsn; +} + +/* * Get the time of the last xlog segment switch */ pg_time_t @@ -8160,7 +8253,7 @@ CreateCheckPoint(int flags) uint32 freespace; XLogRecPtr PriorRedoPtr; XLogRecPtr curInsert; - XLogRecPtr prevPtr; + XLogRecPtr progress_lsn; VirtualTransactionId *vxids; int nvxids; @@ -8241,34 +8334,30 @@ CreateCheckPoint(int flags) checkPoint.oldestActiveXid = InvalidTransactionId; /* + * Get progress before acquiring insert locks to shorten the locked + * section waiting ahead. + */ + progress_lsn = GetProgressRecPtr(); + + /* * We must block concurrent insertions while examining insert state to * determine the checkpoint REDO pointer. */ WALInsertLockAcquireExclusive(); curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); - prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos); /* - * If this isn't a shutdown or forced checkpoint, and we have not inserted - * any XLOG records since the start of the last checkpoint, skip the - * checkpoint. The idea here is to avoid inserting duplicate checkpoints - * when the system is idle. That wastes log space, and more importantly it - * exposes us to possible loss of both current and previous checkpoint - * records if the machine crashes just as we're writing the update. - * (Perhaps it'd make even more sense to checkpoint only when the previous - * checkpoint record is in a different xlog page?) - * - * If the previous checkpoint crossed a WAL segment, however, we create - * the checkpoint anyway, to have the latest checkpoint fully contained in - * the new segment. This is for a little bit of extra robustness: it's - * better if you don't need to keep two WAL segments around to recover the - * checkpoint. + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity, skip the checkpoint. The idea here is to avoid inserting + * duplicate checkpoints when the system is idle. That wastes log space, + * and more importantly it exposes us to possible loss of both current and + * previous checkpoint records if the machine crashes just as we're writing + * the update. */ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) { - if (prevPtr == ControlFile->checkPointCopy.redo && - prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE) + if (progress_lsn == ControlFile->checkPoint) { WALInsertLockRelease(); LWLockRelease(CheckpointLock); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 1e336ed..137492d 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -394,9 +394,23 @@ XLogIncludeOrigin(void) } /* + * XLogInsert + * + * A shorthand for XLogInsertExtended, to update the progress of WAL + * activity by default. + */ +XLogRecPtr +XLogInsert(RmgrId rmid, uint8 info) +{ + return XLogInsertExtended(rmid, info, 0); +} + +/* * Insert an XLOG record having the specified RMID and info bytes, with the * body of the record being the data and buffer references registered earlier - * with XLogRegister* calls. + * with XLogRegister* calls. 'flags' allow users to control more in-depth + * operations during WAL record insertion. As of now, this gives control on + * if the progress LSN positions are updated or not. * * Returns XLOG pointer to end of record (beginning of next record). * This can be used as LSN for data pages affected by the logged action. @@ -405,7 +419,7 @@ XLogIncludeOrigin(void) * WAL rule "write the log before the data".) */ XLogRecPtr -XLogInsert(RmgrId rmid, uint8 info) +XLogInsertExtended(RmgrId rmid, uint8 info, uint8 flags) { XLogRecPtr EndPos; @@ -450,7 +464,7 @@ XLogInsert(RmgrId rmid, uint8 info) rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, &fpw_lsn); - EndPos = XLogInsertRecord(rdt, fpw_lsn); + EndPos = XLogInsertRecord(rdt, fpw_lsn, flags); } while (EndPos == InvalidXLogRecPtr); XLogResetInsertion(); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 00f03d8..79cfd7b 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -78,12 +78,12 @@ int BgWriterDelay = 200; #define LOG_SNAPSHOT_INTERVAL_MS 15000 /* - * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid - * doing so too often or repeatedly if there has been no other write activity - * in the system. + * Last progress LSN and timestamp at which we last logged a standby + * snapshot, to avoid doing so too often or repeatedly if there has been + * no other write activity in the system. */ static TimestampTz last_snapshot_ts; -static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr; +static XLogRecPtr last_progress_lsn = InvalidXLogRecPtr; /* * Flags set by interrupt handlers for later service in the main loop. @@ -310,7 +310,7 @@ BackgroundWriterMain(void) * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * - * We do this logging in the bgwriter as its the only process that is + * We do this logging in the bgwriter as it is the only process that is * run regularly and returns to its mainloop all the time. E.g. * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. @@ -319,19 +319,23 @@ BackgroundWriterMain(void) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); + XLogRecPtr current_progress_lsn = GetProgressRecPtr(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* - * only log if enough time has passed and some xlog record has - * been inserted. + * only log if enough time has passed, that some WAL activity + * has happened since last checkpoint, and that some xlog record + * has been inserted. */ if (now >= timeout && - last_snapshot_lsn != GetXLogInsertRecPtr()) + GetLastCheckpointRecPtr() < current_progress_lsn && + last_progress_lsn < current_progress_lsn) { - last_snapshot_lsn = LogStandbySnapshot(); + (void) LogStandbySnapshot(); last_snapshot_ts = now; + last_progress_lsn = current_progress_lsn; } } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 6a9bf84..75c367c 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -952,7 +952,8 @@ LogStandbySnapshot(void) * The definitions of RunningTransactionsData and xl_xact_running_xacts * are similar. We keep them separate because xl_xact_running_xacts * is a contiguous chunk of memory and never exists fully until it is - * assembled in WAL. + * assembled in WAL. Progress of WAL activity is not updated when + * this record is logged. */ static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) @@ -976,7 +977,9 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) XLogRegisterData((char *) CurrRunningXacts->xids, (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); - recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); + recptr = XLogInsertExtended(RM_STANDBY_ID, + XLOG_RUNNING_XACTS, + XLOG_INSERT_NO_PROGRESS); if (CurrRunningXacts->subxid_overflow) elog(trace_recovery(DEBUG2), @@ -1024,7 +1027,8 @@ LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); - (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); + (void) XLogInsertExtended(RM_STANDBY_ID, XLOG_STANDBY_LOCK, + XLOG_INSERT_NO_PROGRESS); } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 0744c3f..794388e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -209,7 +209,9 @@ extern CheckpointStatsData CheckpointStats; struct XLogRecData; -extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn); +extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); @@ -260,6 +262,8 @@ extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(void); +extern XLogRecPtr GetProgressRecPtr(void); +extern XLogRecPtr GetLastCheckpointRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); extern void RemovePromoteSignalFiles(void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index cc0177e..6ff0176 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,10 +38,15 @@ #define REGBUF_KEEP_DATA 0x10/* include data even if a full-page image is * taken */ +/* flags for XLogInsertExtended */ +#define XLOG_INSERT_NO_PROGRESS 0x01 /* do not update progress LSN + * when inserting record */ + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogIncludeOrigin(void); extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern XLogRecPtr XLogInsertExtended(RmgrId rmid, uint8 info, uint8 flags); extern void XLogEnsureRecordSpace(int nbuffers, int ndatas); extern void XLogRegisterData(char *data, int len); extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers