On Thu, 2008-09-18 at 10:09 -0400, Tom Lane wrote: > Simon Riggs <[EMAIL PROTECTED]> writes: > > On Thu, 2008-09-18 at 09:06 -0400, Tom Lane wrote: > >> Do we really need a checkpoint there at all? > > > "Timelines only change at shutdown checkpoints". > > Hmm. I *think* that that is just a debugging crosscheck rather than a > critical property. But yeah, it would take some close investigation, > which maybe isn't warranted if you have a less-invasive solution.
OK, new patch, version 6. Some major differences from the previous patch: * new IsRecoveryProcessingMode() in shmem * padding in XLogCtl to ensure above call is cheap * specific part of bgwriter shmem for passing restartpoint data * avoid Shutdown checkpoint at end of recovery, with carefully considered positioning of statements (beware!) * only one new postmaster mode, PM_RECOVERY * bgwriter changes state without stopping/starting Modes I have tested so far: * make check * Start, Stop * Crash Recovery * Archive Recovery * Archive Recovery, switch in middle of restartpoint Modes not yet tested: * EXEC_BACKEND Ready for serious review prior to commit. I will be performing further testing also. backend/access/transam/multixact.c | 2 backend/access/transam/xlog.c | 328 ++++++++++++---!!!!!!!!!!!! backend/postmaster/bgwriter.c | 371 +++++---!!!!!!!!!!!!!!!!!!!!! backend/postmaster/postmaster.c | 62 ++++!! backend/storage/buffer/README | 5 backend/storage/buffer/bufmgr.c | 34 +!! include/access/xlog.h | 14 ! include/access/xlog_internal.h | 3 include/catalog/pg_control.h | 2 include/postmaster/bgwriter.h | 2 include/storage/bufmgr.h | 2 include/storage/pmsignal.h | 1 12 files changed, 279 insertions(+), 56 deletions(-), 491 mods(!) There are a few subtle points along the way. I've tried to explain them all in code comments, but questions are welcome. At v6, most things are now done a particular way for a specific reason. Look especially at InRecovery, which is used extensively in different parts of the code. The meaning of this has been subdivided into two meanings, so only *some* of the places that use it have been changed. All have been checked. -- Simon Riggs www.2ndQuadrant.com PostgreSQL Training, Services and Support
Index: src/backend/access/transam/multixact.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/multixact.c,v retrieving revision 1.28 diff -c -r1.28 multixact.c *** src/backend/access/transam/multixact.c 1 Aug 2008 13:16:08 -0000 1.28 --- src/backend/access/transam/multixact.c 22 Sep 2008 19:28:56 -0000 *************** *** 1543,1549 **** * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. */ ! if (!InRecovery) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); --- 1543,1549 ---- * SimpleLruTruncate would get confused. It seems best not to risk * removing any data during recovery anyway, so don't truncate. */ ! if (!IsRecoveryProcessingMode()) TruncateMultiXact(); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.317 diff -c -r1.317 xlog.c *** src/backend/access/transam/xlog.c 11 Aug 2008 11:05:10 -0000 1.317 --- src/backend/access/transam/xlog.c 22 Sep 2008 21:30:24 -0000 *************** *** 119,124 **** --- 119,125 ---- /* Are we doing recovery from XLOG? */ bool InRecovery = false; + bool reachedSafeStopPoint = false; /* Are we recovering using offline XLOG archives? */ static bool InArchiveRecovery = false; *************** *** 131,137 **** static bool recoveryTarget = false; static bool recoveryTargetExact = false; static bool recoveryTargetInclusive = true; - static bool recoveryLogRestartpoints = false; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; static TimestampTz recoveryLastXTime = 0; --- 132,137 ---- *************** *** 286,295 **** --- 286,297 ---- /* * Total shared-memory state for XLOG. 
*/ + #define XLOGCTL_BUFFER_SPACING 128 typedef struct XLogCtlData { /* Protected by WALInsertLock: */ XLogCtlInsert Insert; + char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)]; /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; *************** *** 297,305 **** --- 299,314 ---- uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */ TransactionId ckptXid; XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */ + /* add data structure padding for above info_lck declarations */ + char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) + - sizeof(XLogwrtResult) + - sizeof(uint32) + - sizeof(TransactionId) + - sizeof(XLogRecPtr)]; /* Protected by WALWriteLock: */ XLogCtlWrite Write; + char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)]; /* * These values do not change after startup, although the pointed-to pages *************** *** 311,316 **** --- 320,344 ---- int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; + /* + * IsRecoveryProcessingMode shows whether the postmaster is in a + * postmaster state earlier than PM_RUN, or not. This is a globally + * accessible state to allow EXEC_BACKEND case. + * + * We also retain a local state variable InRecovery. InRecovery=true + * means the code is being executed by Startup process and therefore + * always during RecoveryProcessingMode. This allows us to retain the + * often important distinction between code executed *during* + * RecoveryProcessingMode and but not necessarily by Startup process. + * + * Reviewer's note: all call points InRecovery and InRedo have been checked + * for correctness and have been changed to IsRecoveryProcessingMode() + * if appropriate. 
+ */ + bool IsRecoveryProcessingMode; + + char InfoLockPadding[XLOGCTL_BUFFER_SPACING]; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; *************** *** 396,401 **** --- 424,430 ---- static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); + static XLogRecPtr exitRecovery(void); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); *************** *** 479,484 **** --- 508,518 ---- bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + bool isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END); + + /* cross-check on whether we should be here or not */ + if (IsRecoveryProcessingMode() && !isRecoveryEnd) + elog(FATAL, "cannot make new WAL entries during recovery"); /* info's high bits are reserved for use by me */ if (info & XLR_INFO_MASK) *************** *** 1677,1684 **** XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! /* Disabled during REDO */ ! if (InRedo) return; /* Quick exit if already known flushed */ --- 1711,1717 ---- XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; ! if (IsRecoveryProcessingMode()) return; /* Quick exit if already known flushed */ *************** *** 1766,1774 **** * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we ! * cannot get here while InRedo is true, but if the bad page is brought in ! * and marked dirty during recovery then CreateCheckPoint will try to ! * flush it at the end of recovery.) 
* * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if --- 1799,1807 ---- * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we ! * cannot get here while IsRecoveryProcessingMode(), but if the bad page is ! * brought in and marked dirty during recovery then CreateCheckPoint will ! * try to flush it at the end of recovery.) * * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if *************** *** 2051,2057 **** unlink(tmppath); } ! elog(DEBUG2, "done creating and filling new WAL file"); /* Set flag to tell caller there was no existent file */ *use_existent = false; --- 2084,2091 ---- unlink(tmppath); } ! XLogFileName(tmppath, ThisTimeLineID, log, seg); ! elog(DEBUG2, "done creating and filling new WAL file %s", tmppath); /* Set flag to tell caller there was no existent file */ *use_existent = false; *************** *** 4532,4546 **** } else if (strcmp(tok1, "log_restartpoints") == 0) { - /* - * does nothing if a recovery_target is not also set - */ - if (!parse_bool(tok2, &recoveryLogRestartpoints)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"log_restartpoints\" requires a Boolean value"))); ereport(LOG, ! (errmsg("log_restartpoints = %s", tok2))); } else ereport(FATAL, --- 4566,4574 ---- } else if (strcmp(tok1, "log_restartpoints") == 0) { ereport(LOG, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("parameter \"log_restartpoints\" has been deprecated"))); } else ereport(FATAL, *************** *** 4811,4828 **** CheckPoint checkPoint; bool wasShutdown; bool reachedStopPoint = false; bool haveBackupLabel = false; XLogRecPtr RecPtr, LastRec, checkPointLoc, minRecoveryLoc, ! 
EndOfLog; uint32 endLogId; uint32 endLogSeg; XLogRecord *record; uint32 freespace; TransactionId oldestActiveXID; /* * Read control file and check XLOG status looks valid. * --- 4839,4860 ---- CheckPoint checkPoint; bool wasShutdown; bool reachedStopPoint = false; + bool performedArchiveRecovery = false; bool haveBackupLabel = false; XLogRecPtr RecPtr, LastRec, checkPointLoc, minRecoveryLoc, ! EndOfLog, ! RecoveryCompletionPtr; uint32 endLogId; uint32 endLogSeg; XLogRecord *record; uint32 freespace; TransactionId oldestActiveXID; + XLogCtl->IsRecoveryProcessingMode = true; + /* * Read control file and check XLOG status looks valid. * *************** *** 5039,5044 **** --- 5071,5081 ---- UpdateControlFile(); /* + * Reset pgstat data, because it may be invalid after recovery. + */ + pgstat_reset_all(); + + /* * If there was a backup label file, it's done its job and the info * has now been propagated into pg_control. We must get rid of the * label file so that if we crash during recovery, we'll pick up at *************** *** 5148,5153 **** --- 5185,5205 ---- LastRec = ReadRecPtr; + /* + * Have we reached our safe stopping point? If so, we can + * signal Postmaster to enter consistent recovery mode + */ + if (!reachedSafeStopPoint && + XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr)) + { + reachedSafeStopPoint = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + EndRecPtr.xlogid, EndRecPtr.xrecoff))); + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_START); + } + record = ReadRecord(NULL, LOG); } while (record != NULL && recoveryContinue); *************** *** 5169,5174 **** --- 5221,5227 ---- /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); + reachedSafeStopPoint = true; } } *************** *** 5184,5190 **** * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! 
if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, --- 5237,5243 ---- * Complain if we did not roll forward far enough to render the backup * dump consistent. */ ! if (InRecovery && !reachedSafeStopPoint) { if (reachedStopPoint) /* stopped because of stop request */ ereport(FATAL, *************** *** 5227,5233 **** --- 5280,5289 ---- * we will use that below.) */ if (InArchiveRecovery) + { + performedArchiveRecovery = true; exitArchiveRecovery(curFileTLI, endLogId, endLogSeg); + } /* * Prepare to write WAL starting at EndOfLog position, and init xlog *************** *** 5286,5291 **** --- 5342,5349 ---- /* Pre-scan prepared transactions to find out the range of XIDs present */ oldestActiveXID = PrescanPreparedTransactions(); + RecoveryCompletionPtr = EndOfLog; + if (InRecovery) { int rmid; *************** *** 5306,5343 **** XLogCheckInvalidPages(); /* ! * Reset pgstat data, because it may be invalid after recovery. ! */ ! pgstat_reset_all(); ! ! /* ! * Perform a checkpoint to update all our recovery activity to disk. ! * ! * Note that we write a shutdown checkpoint rather than an on-line ! * one. This is not particularly critical, but since we may be ! * assigning a new TLI, using a shutdown checkpoint allows us to have ! * the rule that TLI only changes in shutdown checkpoints, which ! * allows some extra error checking in xlog_redo. */ ! CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } - /* - * Preallocate additional log files, if wanted. - */ - PreallocXlogFiles(EndOfLog); - - /* - * Okay, we're officially UP. - */ - InRecovery = false; - - ControlFile->state = DB_IN_PRODUCTION; - ControlFile->time = (pg_time_t) time(NULL); - UpdateControlFile(); - /* start the archive_timeout timer running */ ! 
XLogCtl->Write.lastSegSwitchTime = ControlFile->time; /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; --- 5364,5377 ---- XLogCheckInvalidPages(); /* ! * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote ! * a shutdown checkpoint here, but we ask bgwriter to do that now. */ ! RecoveryCompletionPtr = exitRecovery(); } /* start the archive_timeout timer running */ ! XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL); /* initialize shared-memory copy of latest checkpoint XID/epoch */ XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch; *************** *** 5372,5377 **** --- 5406,5446 ---- readRecordBuf = NULL; readRecordBufSize = 0; } + + /* + * Okay, we're officially UP. + */ + XLogCtl->IsRecoveryProcessingMode = false; + + /* + * If we had to perform archive recovery we don't mark the control file, + * yet, since we haven't definitely got a safe point to recover from that + * doesn't rely on archived WAL files. So we switch quickly into normal + * processing and rely on the bgwriter's checkpoint (NOT a restartpoint) + * to define a safe recovery point and put us into full production state. + * We specifically do not want to wait for checkpoint completion here, + * so we can reduce startup time in a standby mode replication failover. + * The checkpoint creation will also flush WAL, so we wait for that + * otherwise we may need to prepare WAL files ourselves. + * + * If we are doing crash recovery, we know we have WAL files accessible + * so we just get started again as quickly as possible. 
+ */ + if (performedArchiveRecovery) + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE); + else + { + XLogFlush(RecoveryCompletionPtr); + ControlFile->state = DB_IN_PRODUCTION; + ControlFile->time = (pg_time_t) time(NULL); + UpdateControlFile(); + } + } + + bool + IsRecoveryProcessingMode(void) + { + return XLogCtl->IsRecoveryProcessingMode; } /* *************** *** 5642,5648 **** * Log end of a checkpoint. */ static void ! LogCheckpointEnd(void) { long write_secs, sync_secs, --- 5711,5717 ---- * Log end of a checkpoint. */ static void ! LogCheckpointEnd(int flags) { long write_secs, sync_secs, *************** *** 5665,5681 **** CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* --- 5734,5759 ---- CheckpointStats.ckpt_sync_end_t, &sync_secs, &sync_usecs); ! if (flags & CHECKPOINT_RESTARTPOINT) ! elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); ! else ! elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); " ! "%d transaction log file(s) added, %d removed, %d recycled; " ! "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s", ! CheckpointStats.ckpt_bufs_written, ! (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, ! CheckpointStats.ckpt_segs_added, ! 
CheckpointStats.ckpt_segs_removed, ! CheckpointStats.ckpt_segs_recycled, ! write_secs, write_usecs / 1000, ! sync_secs, sync_usecs / 1000, ! total_secs, total_usecs / 1000); } /* *************** *** 5944,5949 **** --- 6022,6029 ---- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); if (shutdown) ControlFile->state = DB_SHUTDOWNED; + else + ControlFile->state = DB_IN_PRODUCTION; ControlFile->prevCheckPoint = ControlFile->checkPoint; ControlFile->checkPoint = ProcLastRecPtr; ControlFile->checkPointCopy = checkPoint; *************** *** 6002,6008 **** /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(); LWLockRelease(CheckpointLock); } --- 6082,6088 ---- /* All real work is done, but log before releasing lock. */ if (log_checkpoints) ! LogCheckpointEnd(flags); LWLockRelease(CheckpointLock); } *************** *** 6071,6099 **** } } /* ! * OK, force data out to disk */ ! CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE); /* ! * Update pg_control so that any subsequent crash will restart from this ! * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint ! * record itself. */ ControlFile->prevCheckPoint = ControlFile->checkPoint; ! ControlFile->checkPoint = ReadRecPtr; ! ControlFile->checkPointCopy = *checkPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", ! checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); ! if (recoveryLastXTime) ! ereport((recoveryLogRestartpoints ? LOG : DEBUG2), ! (errmsg("last completed transaction was at log time %s", ! timestamptz_to_str(recoveryLastXTime)))); } /* --- 6151,6215 ---- } } + if (recoveryLastXTime) + ereport((log_checkpoints ? 
LOG : DEBUG2), + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(recoveryLastXTime)))); + + RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStopPoint); + } + + /* + * As of 8.4, RestartPoints are always created by the bgwriter + * once we have reachedSafeStopPoint. We use bgwriter's shared memory + * area wherever we call it from, to keep better code structure. + */ + void + CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint) + { + if (log_checkpoints) + { + /* + * Prepare to accumulate statistics. + */ + + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + /* + * Do the restartpoint equivalent of LogCheckpointStart() + */ + elog(LOG, "restartpoint starting"); + } + + LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); + /* ! * OK, write out dirty blocks smoothly */ ! CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT); /* ! * Update pg_control, using current time */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->prevCheckPoint = ControlFile->checkPoint; ! ControlFile->checkPoint = ReadPtr; ! ControlFile->checkPointCopy = *restartPoint; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* All real work is done, but log before releasing lock. */ + if (log_checkpoints) + LogCheckpointEnd(CHECKPOINT_RESTARTPOINT); ! ereport((log_checkpoints ? LOG : DEBUG2), (errmsg("recovery restart point at %X/%X", ! restartPoint->redo.xlogid, restartPoint->redo.xrecoff))); ! ! LWLockRelease(CheckpointLock); ! } /* *************** *** 6158,6164 **** } /* ! * XLOG resource manager's routines */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) --- 6274,6313 ---- } /* ! * exitRecovery() ! * ! * Exit recovery state and write a XLOG_RECOVERY_END record. This is the ! * only record type that can record a change of timelineID. We assume ! * caller has already set ThisTimeLineID, if appropriate. ! 
*/ ! static XLogRecPtr ! exitRecovery(void) ! { ! XLogRecPtr RecPtr; ! XLogRecData rdata; ! ! rdata.buffer = InvalidBuffer; ! rdata.data = (char *) (&ThisTimeLineID); ! rdata.len = sizeof(TimeLineID); ! rdata.next = NULL; ! ! /* ! * This is the only type of WAL message that can be inserted during ! * recovery. This ensures that we don't allow others to get access ! * until after we have changed state. ! */ ! RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata); ! ! InRecovery = false; ! ! return RecPtr; ! } ! ! /* ! * XLOG resource manager's routines. ! * ! * Definitions of message info are in include/catalog/pg_control.h, ! * though not all messages relate to control file processing. */ void xlog_redo(XLogRecPtr lsn, XLogRecord *record) *************** *** 6193,6213 **** ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; /* ! * TLI may change in a shutdown checkpoint, but it shouldn't decrease */ ! if (checkPoint.ThisTimeLineID != ThisTimeLineID) { ! if (checkPoint.ThisTimeLineID < ThisTimeLineID || !list_member_int(expectedTLIs, ! (int) checkPoint.ThisTimeLineID)) ereport(PANIC, ! (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", ! checkPoint.ThisTimeLineID, ThisTimeLineID))); /* Following WAL records should be run with new TLI */ ! ThisTimeLineID = checkPoint.ThisTimeLineID; } - - RecoveryRestartPoint(&checkPoint); } else if (info == XLOG_CHECKPOINT_ONLINE) { --- 6342,6379 ---- ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; /* ! * TLI no longer changes at shutdown checkpoint, since as of 8.4, ! * shutdown checkpoints only occur at shutdown. Much less confusing. */ ! ! RecoveryRestartPoint(&checkPoint); ! } ! else if (info == XLOG_RECOVERY_END) ! { ! TimeLineID tli; ! ! memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID)); ! ! /* ! * TLI may change when recovery ends, but it shouldn't decrease. ! * ! * This is the only WAL record that can tell us to change timelineID ! * while we process WAL records. ! * ! 
* We can *choose* to stop recovery at any point, generating a ! * new timelineID which is recorded using this record type. ! */ ! if (tli != ThisTimeLineID) { ! if (tli < ThisTimeLineID || !list_member_int(expectedTLIs, ! (int) tli)) ereport(PANIC, ! (errmsg("unexpected timeline ID %u (after %u) at recovery end record", ! tli, ThisTimeLineID))); /* Following WAL records should be run with new TLI */ ! ThisTimeLineID = tli; } } else if (info == XLOG_CHECKPOINT_ONLINE) { Index: src/backend/postmaster/bgwriter.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v retrieving revision 1.51 diff -c -r1.51 bgwriter.c *** src/backend/postmaster/bgwriter.c 11 Aug 2008 11:05:11 -0000 1.51 --- src/backend/postmaster/bgwriter.c 22 Sep 2008 20:58:59 -0000 *************** *** 49,54 **** --- 49,55 ---- #include <unistd.h> #include "access/xlog_internal.h" + #include "catalog/pg_control.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" *************** *** 130,135 **** --- 131,143 ---- int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ + /* + * When the Startup process wants bgwriter to perform a restartpoint, it + * sets these fields so that we can update the control file afterwards. + */ + XLogRecPtr ReadPtr; /* ReadRecPtr for RestartPoint request */ + CheckPoint *restartPoint; /* restartPoint data for ControlFile */ + uint32 num_backend_writes; /* counts non-bgwriter buffer writes */ int num_requests; /* current # of requests */ *************** *** 166,172 **** /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! static XLogRecPtr ckpt_start_recptr; static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; --- 174,180 ---- /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; ! 
static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */ static double ckpt_cached_elapsed; static pg_time_t last_checkpoint_time; *************** *** 198,203 **** --- 206,212 ---- { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; + bool BgWriterRecoveryMode; BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; *************** *** 356,371 **** */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { - bool do_checkpoint = false; - int flags = 0; - pg_time_t now; - int elapsed_secs; - /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. --- 365,381 ---- */ PG_SETMASK(&UnBlockSig); + BgWriterRecoveryMode = IsRecoveryProcessingMode(); + + if (BgWriterRecoveryMode) + elog(DEBUG1, "bgwriter starting during recovery, pid = %u", + BgWriterShmem->bgwriter_pid); + /* * Loop forever */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. *************** *** 383,501 **** got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } - if (checkpoint_requested) - { - checkpoint_requested = false; - do_checkpoint = true; - BgWriterStats.m_requested_checkpoints++; - } - if (shutdown_requested) - { - /* - * From here on, elog(ERROR) should end with exit(1), not send - * control back to the sigsetjmp block above - */ - ExitOnAnyError = true; - /* Close down the database */ - ShutdownXLOG(0, 0); - DumpFreeSpaceMap(0, 0); - /* Normal exit from the bgwriter is here */ - proc_exit(0); /* done */ - } ! /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. ! */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) { ! 
if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! flags |= CHECKPOINT_CAUSE_TIME; } ! ! /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. ! */ ! if (do_checkpoint) { ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! if ((flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. */ ! smgrcloseall(); ! /* ! * Indicate checkpoint completion to any waiting backends. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! ckpt_active = false; ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. 
This is so that time-driven checkpoints ! * happen at a predictable spacing. ! */ ! last_checkpoint_time = now; } - else - BgBufferSync(); - - /* Check for archive_timeout and switch xlog files if necessary. */ - CheckArchiveTimeout(); - - /* Nap for the configured time. */ - BgWriterNap(); } } --- 393,592 ---- got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } ! if (BgWriterRecoveryMode) { ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } ! ! if (!IsRecoveryProcessingMode()) ! { ! elog(DEBUG2, "bgwriter changing from recovery to normal mode"); ! ! InitXLOGAccess(); ! BgWriterRecoveryMode = false; ! ! /* ! * Start time-driven events from now ! */ ! last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); ! ! continue; ! } ! ! if (checkpoint_requested) ! { ! XLogRecPtr ReadPtr; ! CheckPoint restartPoint; ! ! checkpoint_requested = false; ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_time = (pg_time_t) time(NULL); ! ckpt_cached_elapsed = 0; ! ! /* ! * Get the requested values from shared memory that the ! * Startup process has put there for us. ! */ ! SpinLockAcquire(&BgWriterShmem->ckpt_lck); ! ReadPtr = BgWriterShmem->ReadPtr; ! memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint)); ! SpinLockRelease(&BgWriterShmem->ckpt_lck); ! ! CreateRestartPoint(ReadPtr, &restartPoint); ! ! ckpt_active = false; ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! } ! else ! { ! /* Clean buffers dirtied by recovery */ ! BgBufferSync(true); ! ! /* Nap for the configured time. */ ! BgWriterNap(); ! } } ! else /* Normal processing */ { ! bool do_checkpoint = false; ! 
int flags = 0; ! pg_time_t now; ! int elapsed_secs; ! ! Assert(!IsRecoveryProcessingMode()); ! ! if (checkpoint_requested) ! { ! checkpoint_requested = false; ! do_checkpoint = true; ! BgWriterStats.m_requested_checkpoints++; ! } ! if (shutdown_requested) ! { ! /* ! * From here on, elog(ERROR) should end with exit(1), not send ! * control back to the sigsetjmp block above ! */ ! ExitOnAnyError = true; ! /* Close down the database */ ! ShutdownXLOG(0, 0); ! DumpFreeSpaceMap(0, 0); ! /* Normal exit from the bgwriter is here */ ! proc_exit(0); /* done */ ! } /* ! * Force a checkpoint if too much time has elapsed since the last one. ! * Note that we count a timed checkpoint in stats only when this ! * occurs without an external request, but we set the CAUSE_TIME flag ! * bit even if there is also an external request. */ ! now = (pg_time_t) time(NULL); ! elapsed_secs = now - last_checkpoint_time; ! if (elapsed_secs >= CheckPointTimeout) ! { ! if (!do_checkpoint) ! BgWriterStats.m_timed_checkpoints++; ! do_checkpoint = true; ! flags |= CHECKPOINT_CAUSE_TIME; ! } /* ! * Do a checkpoint if requested, otherwise do one cycle of ! * dirty-buffer writing. */ ! if (do_checkpoint) ! { ! /* use volatile pointer to prevent code rearrangement */ ! volatile BgWriterShmemStruct *bgs = BgWriterShmem; ! ! /* ! * Atomically fetch the request flags to figure out what kind of a ! * checkpoint we should perform, and increase the started-counter ! * to acknowledge that we've started a new checkpoint. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! flags |= bgs->ckpt_flags; ! bgs->ckpt_flags = 0; ! bgs->ckpt_started++; ! SpinLockRelease(&bgs->ckpt_lck); ! ! /* ! * We will warn if (a) too soon since last checkpoint (whatever ! * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag ! * since the last checkpoint start. Note in particular that this ! * implementation will not generate warnings caused by ! * CheckPointTimeout < CheckPointWarning. ! */ ! 
if ((flags & CHECKPOINT_CAUSE_XLOG) && ! elapsed_secs < CheckPointWarning) ! ereport(LOG, ! (errmsg("checkpoints are occurring too frequently (%d seconds apart)", ! elapsed_secs), ! errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); ! ! /* ! * Initialize bgwriter-private variables used during checkpoint. ! */ ! ckpt_active = true; ! ckpt_start_recptr = GetInsertRecPtr(); ! ckpt_start_time = now; ! ckpt_cached_elapsed = 0; ! ! /* ! * Do the checkpoint. ! */ ! CreateCheckPoint(flags); ! ! /* ! * After any checkpoint, close all smgr files. This is so we ! * won't hang onto smgr references to deleted files indefinitely. ! */ ! smgrcloseall(); ! ! /* ! * Indicate checkpoint completion to any waiting backends. ! */ ! SpinLockAcquire(&bgs->ckpt_lck); ! bgs->ckpt_done = bgs->ckpt_started; ! SpinLockRelease(&bgs->ckpt_lck); ! ! ckpt_active = false; ! ! /* ! * Note we record the checkpoint start time not end time as ! * last_checkpoint_time. This is so that time-driven checkpoints ! * happen at a predictable spacing. ! */ ! last_checkpoint_time = now; ! } ! else ! BgBufferSync(false); ! /* Check for archive_timeout and switch xlog files if necessary. */ ! CheckArchiveTimeout(); ! /* Nap for the configured time. */ ! BgWriterNap(); } } } *************** *** 588,594 **** (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! AbsorbFsyncRequests(); udelay -= 1000000L; } --- 679,686 ---- (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)) break; pg_usleep(1000000L); ! if (!IsRecoveryProcessingMode()) ! 
AbsorbFsyncRequests(); udelay -= 1000000L; } *************** *** 642,647 **** --- 734,751 ---- if (!am_bg_writer) return; + /* Perform minimal duties during recovery and skip wait if requested */ + if (IsRecoveryProcessingMode()) + { + BgBufferSync(true); + + if (!shutdown_requested && + IsCheckpointOnSchedule(progress)) + BgWriterNap(); + + return; + } + /* * Perform the usual bgwriter duties and take a nap, unless we're behind * schedule, in which case we just try to catch up as quickly as possible. *************** *** 660,666 **** AbsorbFsyncRequests(); absorb_counter = WRITES_PER_ABSORB; ! BgBufferSync(); CheckArchiveTimeout(); BgWriterNap(); } --- 764,770 ---- AbsorbFsyncRequests(); absorb_counter = WRITES_PER_ABSORB; ! BgBufferSync(false); CheckArchiveTimeout(); BgWriterNap(); } *************** *** 716,731 **** * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; } /* --- 820,838 ---- * However, it's good enough for our purposes, we're only calculating an * estimate anyway. */ ! if (!IsRecoveryProcessingMode()) { ! recptr = GetInsertRecPtr(); ! elapsed_xlogs = ! (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile + ! ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) / ! CheckPointSegments; ! ! if (progress < elapsed_xlogs) ! { ! ckpt_cached_elapsed = elapsed_xlogs; ! return false; ! 
} } /* *************** *** 967,972 **** --- 1074,1109 ---- } /* + * Always runs in Startup process (see xlog.c) + */ + void + RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter) + { + /* + * Should we just do it ourselves? + */ + if (!IsPostmasterEnvironment || !sendToBGWriter) + { + CreateRestartPoint(ReadPtr, restartPoint); + return; + } + + /* + * Push requested values into shared memory, then signal to request restartpoint. + */ + if (BgWriterShmem->bgwriter_pid == 0) + elog(LOG, "could not request restartpoint because bgwriter not running"); + + SpinLockAcquire(&BgWriterShmem->ckpt_lck); + BgWriterShmem->ReadPtr = ReadPtr; + memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint)); + SpinLockRelease(&BgWriterShmem->ckpt_lck); + + if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0) + elog(LOG, "could not signal for restartpoint: %m"); + } + + /* * ForwardFsyncRequest * Forward a file-fsync request from a backend to the bgwriter * Index: src/backend/postmaster/postmaster.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v retrieving revision 1.561 diff -c -r1.561 postmaster.c *** src/backend/postmaster/postmaster.c 26 Jun 2008 02:47:19 -0000 1.561 --- src/backend/postmaster/postmaster.c 22 Sep 2008 20:25:36 -0000 *************** *** 254,259 **** --- 254,264 ---- { PM_INIT, /* postmaster starting */ PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* consistent recovery mode; state only + * entered for archive and streaming recovery, + * and only after the point where + * all data is in a consistent state. + */ PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKUP, /* waiting for online backup mode to end */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ *************** *** 1294,1300 **** * state that prevents it, start one. 
It doesn't matter if this * fails, we'll just try again later. */ ! if (BgWriterPID == 0 && pmState == PM_RUN) BgWriterPID = StartBackgroundWriter(); /* --- 1299,1305 ---- * state that prevents it, start one. It doesn't matter if this * fails, we'll just try again later. */ ! if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY)) BgWriterPID = StartBackgroundWriter(); /* *************** *** 2104,2110 **** if (pid == StartupPID) { StartupPID = 0; ! Assert(pmState == PM_STARTUP); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) --- 2109,2115 ---- if (pid == StartupPID) { StartupPID = 0; ! Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY); /* FATAL exit of startup is treated as catastrophic */ if (!EXIT_STATUS_0(exitstatus)) *************** *** 2145,2155 **** load_role(); /* ! * Crank up the background writer. It doesn't matter if this ! * fails, we'll just try again later. */ ! Assert(BgWriterPID == 0); ! BgWriterPID = StartBackgroundWriter(); /* * Likewise, start other special children as needed. In a restart --- 2150,2160 ---- load_role(); /* ! * Check whether we need to start background writer, if not ! * already running. */ ! if (BgWriterPID == 0) ! BgWriterPID = StartBackgroundWriter(); /* * Likewise, start other special children as needed. In a restart *************** *** 3821,3826 **** --- 3826,3876 ---- PG_SETMASK(&BlockSig); + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START)) + { + Assert(pmState == PM_STARTUP); + + /* + * Go to shutdown mode if a shutdown request was pending. + */ + if (Shutdown > NoShutdown) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + } + else + { + /* + * Startup process has entered recovery + */ + pmState = PM_RECOVERY; + + /* + * Load the flat authorization file into postmaster's cache. The + * startup process won't have recomputed this from the database yet, + * so it may change following recovery. 
+ */ + load_role(); + + /* + * Crank up the background writer. It doesn't matter if this + * fails, we'll just try again later. + */ + Assert(BgWriterPID == 0); + BgWriterPID = StartBackgroundWriter(); + + /* + * Likewise, start other special children as needed. + */ + Assert(PgStatPID == 0); + PgStatPID = pgstat_start(); + + /* XXX at this point we could accept read-only connections */ + ereport(DEBUG1, + (errmsg("database system is in consistent recovery mode"))); + } + } + if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE)) { /* Index: src/backend/storage/buffer/README =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/README,v retrieving revision 1.14 diff -c -r1.14 README *** src/backend/storage/buffer/README 21 Mar 2008 13:23:28 -0000 1.14 --- src/backend/storage/buffer/README 22 Sep 2008 21:54:20 -0000 *************** *** 264,266 **** --- 264,271 ---- This ensures that the page image transferred to disk is reasonably consistent. We might miss a hint-bit update or two but that isn't a problem, for the same reasons mentioned under buffer access rules. + + As of 8.4, the background writer starts during recovery mode when there is + some form of potentially extended recovery to perform. It performs an + identical service to normal processing, except that the checkpoints it + writes are technically restartpoints and it does not need to flush WAL for dirty buffers. Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.237 diff -c -r1.237 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 11 Aug 2008 11:05:11 -0000 1.237 --- src/backend/storage/buffer/bufmgr.c 22 Sep 2008 21:01:02 -0000 *************** *** 1080,1086 **** * * This is called at checkpoint time to write out all dirty shared buffers. 
* The checkpoint request flags should be passed in; currently the only one ! * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. */ static void BufferSync(int flags) --- 1080,1087 ---- * * This is called at checkpoint time to write out all dirty shared buffers. * The checkpoint request flags should be passed in; currently the only one ! * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes, ! * except for restartpoints, which use CHECKPOINT_RESTARTPOINT. */ static void BufferSync(int flags) *************** *** 1089,1094 **** --- 1090,1096 ---- int num_to_scan; int num_to_write; int num_written; + bool cleanup_and_exit = false; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); *************** *** 1163,1169 **** */ if (bufHdr->flags & BM_CHECKPOINT_NEEDED) { ! if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; --- 1165,1180 ---- */ if (bufHdr->flags & BM_CHECKPOINT_NEEDED) { ! if (cleanup_and_exit) ! { ! LockBufHdr(bufHdr); ! ! if (bufHdr->flags & BM_CHECKPOINT_NEEDED) ! bufHdr->flags &= ~BM_CHECKPOINT_NEEDED; ! ! UnlockBufHdr(bufHdr); ! } ! else if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; *************** *** 1194,1199 **** --- 1205,1218 ---- if (++buf_id >= NBuffers) buf_id = 0; + + /* + * Quit scanning if state changes while we're here. If it + * does we want to stop performing the current restartpoint + * as quickly as possible. We'll write a real checkpoint instead. + */ + if (flags & CHECKPOINT_RESTARTPOINT && !IsRecoveryProcessingMode()) + cleanup_and_exit = true; } TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write); *************** *** 1211,1217 **** * This is called periodically by the background writer process. */ void ! 
BgBufferSync(void) { /* info obtained from freelist.c */ int strategy_buf_id; --- 1230,1236 ---- * This is called periodically by the background writer process. */ void ! BgBufferSync(bool calledInRecovery) { /* info obtained from freelist.c */ int strategy_buf_id; *************** *** 1423,1428 **** --- 1442,1456 ---- { int buffer_state = SyncOneBuffer(next_to_clean, true); + /* + * Quit scanning if state changes while we're here, otherwise + * we might skip writing WAL for a buffer that was modified + * by a transaction that committed very soon after recovery + * mode completes. + */ + if (calledInRecovery && !IsRecoveryProcessingMode()) + break; + if (++next_to_clean >= NBuffers) { next_to_clean = 0; Index: src/include/access/xlog.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v retrieving revision 1.88 diff -c -r1.88 xlog.h *** src/include/access/xlog.h 12 May 2008 08:35:05 -0000 1.88 --- src/include/access/xlog.h 22 Sep 2008 19:04:56 -0000 *************** *** 133,139 **** } XLogRecData; extern TimeLineID ThisTimeLineID; /* current TLI */ ! extern bool InRecovery; extern XLogRecPtr XactLastRecEnd; /* these variables are GUC parameters related to XLOG */ --- 133,148 ---- } XLogRecData; extern TimeLineID ThisTimeLineID; /* current TLI */ ! ! /* ! * Prior to 8.4, all activity during recovery was carried out by the Startup ! * process. This local variable continues to be used in many parts of the ! * code to indicate actions taken by RecoveryManagers. Other processes that ! * potentially perform work during recovery should check ! * IsRecoveryProcessingMode(); see XLogCtl notes in xlog.c. ! */ ! extern bool InRecovery; ! 
extern XLogRecPtr XactLastRecEnd; /* these variables are GUC parameters related to XLOG */ *************** *** 166,171 **** --- 175,181 ---- /* These indicate the cause of a checkpoint request */ #define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */ #define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */ + #define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */ /* Checkpoint statistics */ typedef struct CheckpointStatsData *************** *** 197,202 **** --- 207,214 ---- extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); + extern bool IsRecoveryProcessingMode(void); + extern void UpdateControlFile(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); Index: src/include/access/xlog_internal.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog_internal.h,v retrieving revision 1.24 diff -c -r1.24 xlog_internal.h *** src/include/access/xlog_internal.h 11 Aug 2008 11:05:11 -0000 1.24 --- src/include/access/xlog_internal.h 22 Sep 2008 20:26:56 -0000 *************** *** 17,22 **** --- 17,23 ---- #define XLOG_INTERNAL_H #include "access/xlog.h" + #include "catalog/pg_control.h" #include "fmgr.h" #include "pgtime.h" #include "storage/block.h" *************** *** 245,250 **** --- 246,253 ---- extern pg_time_t GetLastSegSwitchTime(void); extern XLogRecPtr RequestXLogSwitch(void); + extern void CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint); + /* * These aren't in xlog.h because I'd rather not include fmgr.h there. 
*/ Index: src/include/catalog/pg_control.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/catalog/pg_control.h,v retrieving revision 1.41 diff -c -r1.41 pg_control.h *** src/include/catalog/pg_control.h 21 Apr 2008 00:26:47 -0000 1.41 --- src/include/catalog/pg_control.h 22 Sep 2008 19:21:37 -0000 *************** *** 46,52 **** #define XLOG_NOOP 0x20 #define XLOG_NEXTOID 0x30 #define XLOG_SWITCH 0x40 ! /* System status indicator */ typedef enum DBState --- 46,52 ---- #define XLOG_NOOP 0x20 #define XLOG_NEXTOID 0x30 #define XLOG_SWITCH 0x40 ! #define XLOG_RECOVERY_END 0x50 /* System status indicator */ typedef enum DBState Index: src/include/postmaster/bgwriter.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v retrieving revision 1.12 diff -c -r1.12 bgwriter.h *** src/include/postmaster/bgwriter.h 11 Aug 2008 11:05:11 -0000 1.12 --- src/include/postmaster/bgwriter.h 22 Sep 2008 15:53:22 -0000 *************** *** 12,17 **** --- 12,18 ---- #ifndef _BGWRITER_H #define _BGWRITER_H + #include "catalog/pg_control.h" #include "storage/block.h" #include "storage/relfilenode.h" *************** *** 25,30 **** --- 26,32 ---- extern void BackgroundWriterMain(void); extern void RequestCheckpoint(int flags); + extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter); extern void CheckpointWriteDelay(int flags, double progress); extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, Index: src/include/storage/bufmgr.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/bufmgr.h,v retrieving revision 1.115 diff -c -r1.115 bufmgr.h *** src/include/storage/bufmgr.h 11 Aug 2008 11:05:11 -0000 1.115 --- src/include/storage/bufmgr.h 22 Sep 2008 18:36:54 -0000 
*************** *** 193,199 **** extern void AbortBufferIO(void); extern void BufmgrCommit(void); ! extern void BgBufferSync(void); extern void AtProcExit_LocalBuffers(void); --- 193,199 ---- extern void AbortBufferIO(void); extern void BufmgrCommit(void); ! extern void BgBufferSync(bool calledInRecovery); extern void AtProcExit_LocalBuffers(void); Index: src/include/storage/pmsignal.h =================================================================== RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v retrieving revision 1.20 diff -c -r1.20 pmsignal.h *** src/include/storage/pmsignal.h 19 Jun 2008 21:32:56 -0000 1.20 --- src/include/storage/pmsignal.h 22 Sep 2008 15:54:57 -0000 *************** *** 22,27 **** --- 22,28 ---- */ typedef enum { + PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */ PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */ PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers