Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Simon Riggs Tue, 23 Sep 2008 08:42:49 -0700

On Mon, 2008-09-22 at 23:06 +0100, Simon Riggs wrote:
> On Thu, 2008-09-18 at 10:09 -0400, Tom Lane wrote:
> > Simon Riggs <[EMAIL PROTECTED]> writes:
> > > On Thu, 2008-09-18 at 09:06 -0400, Tom Lane wrote:
> > >> Do we really need a checkpoint there at all?
> > 
> > > "Timelines only change at shutdown checkpoints".
> > 
> > Hmm.  I *think* that that is just a debugging crosscheck rather than a
> > critical property.  But yeah, it would take some close investigation,
> > which maybe isn't warranted if you have a less-invasive solution.
> 
> OK, new patch, version 6. Some major differences to previous patch.


> Ready for serious review prior to commit. I will be performing further
> testing also.

Version 7

I've removed the concept of interrupting a restartpoint halfway
through; I found a fault there. It was more ugly than the alternative
and less robust. The code now waits at the end of recovery if we are in
the middle of a restartpoint, but forces a do-it-more-quickly also. That
means we won't always get a fast start even though we skip the shutdown
checkpoint, but at least we're sure there's no chance of breakage
because of concurrent activity, state changes, etc.

I'm happy with this now.

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support

Index: src/backend/access/transam/multixact.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/multixact.c,v
retrieving revision 1.28
diff -c -r1.28 multixact.c
*** src/backend/access/transam/multixact.c	1 Aug 2008 13:16:08 -0000	1.28
--- src/backend/access/transam/multixact.c	22 Sep 2008 19:28:56 -0000
***************
*** 1543,1549 ****
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
  	 */
! 	if (!InRecovery)
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
--- 1543,1549 ----
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
  	 */
! 	if (!IsRecoveryProcessingMode())
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c	11 Aug 2008 11:05:10 -0000	1.317
--- src/backend/access/transam/xlog.c	23 Sep 2008 14:56:37 -0000
***************
*** 66,76 ****
  bool		fullPageWrites = true;
  bool		log_checkpoints = false;
  int 		sync_method = DEFAULT_SYNC_METHOD;
- 
  #ifdef WAL_DEBUG
  bool		XLOG_DEBUG = false;
  #endif
- 
  /*
   * XLOGfileslop is the maximum number of preallocated future XLOG segments.
   * When we are done with an old XLOG segment file, we will recycle it as a
--- 66,74 ----
***************
*** 119,124 ****
--- 117,123 ----
  
  /* Are we doing recovery from XLOG? */
  bool		InRecovery = false;
+ bool		reachedSafeStopPoint = false;
  
  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;
***************
*** 131,137 ****
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
--- 130,135 ----
***************
*** 286,295 ****
--- 284,295 ----
  /*
   * Total shared-memory state for XLOG.
   */
+ #define	XLOGCTL_BUFFER_SPACING	128
  typedef struct XLogCtlData
  {
  	/* Protected by WALInsertLock: */
  	XLogCtlInsert Insert;
+ 	char	InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
  
  	/* Protected by info_lck: */
  	XLogwrtRqst LogwrtRqst;
***************
*** 297,305 ****
--- 297,312 ----
  	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
  	TransactionId ckptXid;
  	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
+ 	/* add data structure padding for above info_lck declarations */
+ 	char	InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) 
+ 												- sizeof(XLogwrtResult)
+ 												- sizeof(uint32)
+ 												- sizeof(TransactionId)
+ 												- sizeof(XLogRecPtr)];
  
  	/* Protected by WALWriteLock: */
  	XLogCtlWrite Write;
+ 	char	WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
  
  	/*
  	 * These values do not change after startup, although the pointed-to pages
***************
*** 311,316 ****
--- 318,342 ----
  	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
  	TimeLineID	ThisTimeLineID;
  
+ 	/*
+ 	 * IsRecoveryProcessingMode shows whether the postmaster is in a
+ 	 * postmaster state earlier than PM_RUN, or not. This is a globally
+ 	 * accessible state to allow EXEC_BACKEND case.
+ 	 *
+ 	 * We also retain a local state variable InRecovery. InRecovery=true
+ 	 * means the code is being executed by Startup process and therefore
+ 	 * always during RecoveryProcessingMode. This allows us to retain the
+ 	 * often important distinction between code executed by the Startup
+ 	 * process itself and code merely executed *during* RecoveryProcessingMode.
+ 	 *
+ 	 * Reviewer's note: all call points InRecovery and InRedo have been checked
+ 	 * for correctness and have been changed to IsRecoveryProcessingMode()
+ 	 * if appropriate.
+ 	 */
+ 	bool		IsRecoveryProcessingMode;
+ 
+ 	char		InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+ 
  	slock_t		info_lck;		/* locks shared variables shown above */
  } XLogCtlData;
  
***************
*** 396,401 ****
--- 422,428 ----
  static void readRecoveryCommandFile(void);
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
+ static XLogRecPtr exitRecovery(void);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
  static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  
***************
*** 479,484 ****
--- 506,516 ----
  	bool		updrqst;
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ 	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+ 
+ 	/* cross-check on whether we should be here or not */
+ 	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ 		elog(FATAL, "cannot make new WAL entries during recovery");
  
  	/* info's high bits are reserved for use by me */
  	if (info & XLR_INFO_MASK)
***************
*** 1677,1684 ****
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	/* Disabled during REDO */
! 	if (InRedo)
  		return;
  
  	/* Quick exit if already known flushed */
--- 1709,1715 ----
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	if (IsRecoveryProcessingMode())
  		return;
  
  	/* Quick exit if already known flushed */
***************
*** 1766,1774 ****
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while InRedo is true, but if the bad page is brought in
! 	 * and marked dirty during recovery then CreateCheckPoint will try to
! 	 * flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
--- 1797,1805 ----
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
! 	 * brought in and marked dirty during recovery then CreateCheckPoint will
! 	 * try to flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
***************
*** 2051,2057 ****
  		unlink(tmppath);
  	}
  
! 	elog(DEBUG2, "done creating and filling new WAL file");
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
--- 2082,2089 ----
  		unlink(tmppath);
  	}
  
! 	XLogFileName(tmppath, ThisTimeLineID, log, seg);
! 	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
***************
*** 4532,4546 ****
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
- 			/*
- 			 * does nothing if a recovery_target is not also set
- 			 */
- 			if (!parse_bool(tok2, &recoveryLogRestartpoints))
- 				  ereport(ERROR,
- 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
  			ereport(LOG,
! 					(errmsg("log_restartpoints = %s", tok2)));
  		}
  		else
  			ereport(FATAL,
--- 4564,4572 ----
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
  			ereport(LOG,
! 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					  errmsg("parameter \"log_restartpoints\" has been deprecated")));
  		}
  		else
  			ereport(FATAL,
***************
*** 4811,4828 ****
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
  				checkPointLoc,
  				minRecoveryLoc,
! 				EndOfLog;
  	uint32		endLogId;
  	uint32		endLogSeg;
  	XLogRecord *record;
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
--- 4837,4858 ----
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
+ 	bool		performedArchiveRecovery = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
  				checkPointLoc,
  				minRecoveryLoc,
! 				EndOfLog,
! 				RecoveryCompletionPtr;
  	uint32		endLogId;
  	uint32		endLogSeg;
  	XLogRecord *record;
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
+ 	XLogCtl->IsRecoveryProcessingMode = true;
+ 
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
***************
*** 5039,5044 ****
--- 5069,5079 ----
  		UpdateControlFile();
  
  		/*
+ 		 * Reset pgstat data, because it may be invalid after recovery.
+ 		 */
+ 		pgstat_reset_all();
+ 
+ 		/*
  		 * If there was a backup label file, it's done its job and the info
  		 * has now been propagated into pg_control.  We must get rid of the
  		 * label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5183,5203 ----
  
  				LastRec = ReadRecPtr;
  
+ 				/*
+ 				 * Have we reached our safe stopping point? If so, we can
+ 				 * signal Postmaster to enter consistent recovery mode
+ 				 */
+ 				if (!reachedSafeStopPoint && 
+ 					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ 				{
+ 					reachedSafeStopPoint = true;
+ 					ereport(LOG,
+ 						(errmsg("consistent recovery state reached at %X/%X",
+ 							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ 					if (IsUnderPostmaster)
+ 						SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ 				}
+ 
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
***************
*** 5169,5174 ****
--- 5219,5225 ----
  			/* there are no WAL records following the checkpoint */
  			ereport(LOG,
  					(errmsg("redo is not required")));
+ 			reachedSafeStopPoint = true;
  		}
  	}
  
***************
*** 5184,5190 ****
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
--- 5235,5241 ----
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (InRecovery && !reachedSafeStopPoint)
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
***************
*** 5227,5233 ****
--- 5278,5287 ----
  	 * we will use that below.)
  	 */
  	if (InArchiveRecovery)
+ 	{
+ 		performedArchiveRecovery = true;
  		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
+ 	}
  
  	/*
  	 * Prepare to write WAL starting at EndOfLog position, and init xlog
***************
*** 5286,5291 ****
--- 5340,5347 ----
  	/* Pre-scan prepared transactions to find out the range of XIDs present */
  	oldestActiveXID = PrescanPreparedTransactions();
  
+ 	RecoveryCompletionPtr = EndOfLog;
+ 
  	if (InRecovery)
  	{
  		int			rmid;
***************
*** 5306,5343 ****
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Reset pgstat data, because it may be invalid after recovery.
  		 */
! 		pgstat_reset_all();
! 
! 		/*
! 		 * Perform a checkpoint to update all our recovery activity to disk.
! 		 *
! 		 * Note that we write a shutdown checkpoint rather than an on-line
! 		 * one. This is not particularly critical, but since we may be
! 		 * assigning a new TLI, using a shutdown checkpoint allows us to have
! 		 * the rule that TLI only changes in shutdown checkpoints, which
! 		 * allows some extra error checking in xlog_redo.
! 		 */
! 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
  	}
  
- 	/*
- 	 * Preallocate additional log files, if wanted.
- 	 */
- 	PreallocXlogFiles(EndOfLog);
- 
- 	/*
- 	 * Okay, we're officially UP.
- 	 */
- 	InRecovery = false;
- 
- 	ControlFile->state = DB_IN_PRODUCTION;
- 	ControlFile->time = (pg_time_t) time(NULL);
- 	UpdateControlFile();
- 
  	/* start the archive_timeout timer running */
! 	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
  
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
--- 5362,5375 ----
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
! 		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
  		 */
! 		RecoveryCompletionPtr = exitRecovery();
  	}
  
  	/* start the archive_timeout timer running */
! 	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
  
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
***************
*** 5372,5377 ****
--- 5404,5446 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 
+ 	/*
+ 	 * Okay, we're officially UP.
+ 	 */
+ 	XLogCtl->IsRecoveryProcessingMode = false;
+ 
+ 	/*
+ 	 * If we had to perform archive recovery we don't mark the control file,
+ 	 * yet, since we haven't definitely got a safe point to recover from that
+ 	 * doesn't rely on archived WAL files. So we switch quickly into normal 
+ 	 * processing and rely on the bgwriter's checkpoint (NOT a restartpoint)
+ 	 * to define a safe recovery point and put us into full production state. 
+ 	 * We specifically do not want to wait for checkpoint completion here,
+ 	 * so we can reduce startup time in a standby mode replication failover.
+ 	 * The checkpoint creation will also flush WAL, so we wait for that
+ 	 * otherwise we may need to prepare WAL files ourselves.
+ 	 *
+ 	 * If we are doing crash recovery, we know we have WAL files accessible
+ 	 * so we just get started again as quickly as possible.
+ 	 */
+ 	if (performedArchiveRecovery)
+ 		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ 	else
+ 	{
+ 		XLogFlush(RecoveryCompletionPtr);
+ 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ 		ControlFile->state = DB_IN_PRODUCTION;
+ 		ControlFile->time = (pg_time_t) time(NULL);
+ 		UpdateControlFile();
+ 		LWLockRelease(ControlFileLock);
+ 	}
+ }
+ 
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ 	return XLogCtl->IsRecoveryProcessingMode;
  }
  
  /*
***************
*** 5629,5648 ****
  static void
  LogCheckpointStart(int flags)
  {
! 	elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! 		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! 		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! 		 (flags & CHECKPOINT_FORCE) ? " force" : "",
! 		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
! 		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! 		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
  }
  
  /*
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(void)
  {
  	long		write_secs,
  				sync_secs,
--- 5698,5721 ----
  static void
  LogCheckpointStart(int flags)
  {
! 	if (flags & CHECKPOINT_RESTARTPOINT)
! 		elog(LOG, "restartpoint starting:%s",
! 			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
! 	else
! 		elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! 			 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! 			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! 			 (flags & CHECKPOINT_FORCE) ? " force" : "",
! 			 (flags & CHECKPOINT_WAIT) ? " wait" : "",
! 			 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! 			 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
  }
  
  /*
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(int flags)
  {
  	long		write_secs,
  				sync_secs,
***************
*** 5665,5681 ****
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 		 "%d transaction log file(s) added, %d removed, %d recycled; "
! 		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 		 CheckpointStats.ckpt_bufs_written,
! 		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 		 CheckpointStats.ckpt_segs_added,
! 		 CheckpointStats.ckpt_segs_removed,
! 		 CheckpointStats.ckpt_segs_recycled,
! 		 write_secs, write_usecs / 1000,
! 		 sync_secs, sync_usecs / 1000,
! 		 total_secs, total_usecs / 1000);
  }
  
  /*
--- 5738,5763 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	if (flags & CHECKPOINT_RESTARTPOINT)
! 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
! 	else
! 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 CheckpointStats.ckpt_segs_added,
! 			 CheckpointStats.ckpt_segs_removed,
! 			 CheckpointStats.ckpt_segs_recycled,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  }
  
  /*
***************
*** 5944,5949 ****
--- 6026,6033 ----
  	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	if (shutdown)
  		ControlFile->state = DB_SHUTDOWNED;
+ 	else
+ 		ControlFile->state = DB_IN_PRODUCTION;
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ProcLastRecPtr;
  	ControlFile->checkPointCopy = checkPoint;
***************
*** 6002,6008 ****
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd();
  
  	LWLockRelease(CheckpointLock);
  }
--- 6086,6092 ----
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd(flags);
  
  	LWLockRelease(CheckpointLock);
  }
***************
*** 6071,6099 ****
  			}
  	}
  
! 	/*
! 	 * OK, force data out to disk
! 	 */
! 	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
  
  	/*
! 	 * Update pg_control so that any subsequent crash will restart from this
! 	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
! 	 * record itself.
  	 */
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadRecPtr;
! 	ControlFile->checkPointCopy = *checkPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
  
! 	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 	if (recoveryLastXTime)
! 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
  }
  
  /*
--- 6155,6213 ----
  			}
  	}
  
! 	if (recoveryLastXTime)
! 		ereport((log_checkpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
! 
! 	RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStopPoint);
! }
! 
! /*
!  * As of 8.4, RestartPoints are always created by the bgwriter
!  * once we have reachedSafeStopPoint. We use bgwriter's shared memory
!  * area wherever we call it from, to keep better code structure.
!  */
! void
! CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
! {
! 	if (log_checkpoints)
! 	{
! 		/*
! 		 * Prepare to accumulate statistics.
! 		 */
! 
! 		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
! 		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
! 
! 		LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
! 	}
! 
! 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
! 
! 	CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
  
  	/*
! 	 * Update pg_control, using current time
  	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadPtr;
! 	ControlFile->checkPointCopy = *restartPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
+ 	LWLockRelease(ControlFileLock);
  
! 	/* All real work is done, but log before releasing lock. */
! 	if (log_checkpoints)
! 		LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
! 
! 	ereport((log_checkpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
! 
! 	LWLockRelease(CheckpointLock);
! 
  }
  
  /*
***************
*** 6158,6164 ****
  }
  
  /*
!  * XLOG resource manager's routines
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 6272,6326 ----
  }
  
  /*
!  * exitRecovery()
!  *
!  * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
!  * only record type that can record a change of timelineID. We assume
!  * caller has already set ThisTimeLineID, if appropriate.
!  */
! static XLogRecPtr
! exitRecovery(void)
! {
! 	XLogRecPtr	RecPtr;
! 	XLogRecData rdata;
! 
! 	rdata.buffer = InvalidBuffer;
! 	rdata.data = (char *) (&ThisTimeLineID);
! 	rdata.len = sizeof(TimeLineID);
! 	rdata.next = NULL;
! 
! 	/*
! 	 * If a restartpoint is in progress, send a second signal to
! 	 * get it to go faster so we don't need to wait as long. Then wait for
! 	 * the lock, so we know the restartpoint has completed. It's
! 	 * difficult to interrupt it without risking robustness, plus any work
! 	 * it does now will be work we can avoid later when we checkpoint.
! 	 */
! 	if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
! 	{
! 		RequestRestartPointCompletion();
! 		elog(LOG, "startup process waiting for restartpoint to complete");
! 		LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
! 		LWLockRelease(CheckpointLock);
! 	}
! 
! 	/*
! 	 * This is the only type of WAL message that can be inserted during
! 	 * recovery. This ensures that we don't allow others to get access
! 	 * until after we have changed state.
! 	 */
! 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
! 
! 	InRecovery = false;
! 
! 	return RecPtr;
! }
! 
! /*
!  * XLOG resource manager's routines.
!  *
!  * Definitions of message info are in include/catalog/pg_control.h,
!  * though not all messages relate to control file processing.
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6193,6213 ****
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
  		/*
! 		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
  		 */
! 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
  		{
! 			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
  				!list_member_int(expectedTLIs,
! 								 (int) checkPoint.ThisTimeLineID))
  				ereport(PANIC,
! 						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
! 								checkPoint.ThisTimeLineID, ThisTimeLineID)));
  			/* Following WAL records should be run with new TLI */
! 			ThisTimeLineID = checkPoint.ThisTimeLineID;
  		}
- 
- 		RecoveryRestartPoint(&checkPoint);
  	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
--- 6355,6392 ----
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
  		/*
! 		 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
! 		 * shutdown checkpoints only occur at shutdown. Much less confusing.
  		 */
! 
! 		RecoveryRestartPoint(&checkPoint);
! 	}
! 	else if (info == XLOG_RECOVERY_END)
! 	{
! 		TimeLineID	tli;
! 
! 		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
! 
! 		/*
! 		 * TLI may change when recovery ends, but it shouldn't decrease.
! 		 *
! 		 * This is the only WAL record that can tell us to change timelineID
! 		 * while we process WAL records. 
! 		 *
! 		 * We can *choose* to stop recovery at any point, generating a
! 		 * new timelineID which is recorded using this record type.
! 		 */
! 		if (tli != ThisTimeLineID)
  		{
! 			if (tli < ThisTimeLineID ||
  				!list_member_int(expectedTLIs,
! 								 (int) tli))
  				ereport(PANIC,
! 						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
! 								tli, ThisTimeLineID)));
  			/* Following WAL records should be run with new TLI */
! 			ThisTimeLineID = tli;
  		}
  	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
***************
*** 6288,6294 ****
  }
  
  #ifdef WAL_DEBUG
- 
  static void
  xlog_outrec(StringInfo buf, XLogRecord *record)
  {
--- 6467,6472 ----
***************
*** 6308,6314 ****
  }
  #endif   /* WAL_DEBUG */
  
- 
  /*
   * Return the (possible) sync flag used for opening a file, depending on the
   * value of the GUC wal_sync_method.
--- 6486,6491 ----
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c	11 Aug 2008 11:05:11 -0000	1.51
--- src/backend/postmaster/bgwriter.c	23 Sep 2008 14:56:29 -0000
***************
*** 49,54 ****
--- 49,55 ----
  #include <unistd.h>
  
  #include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
***************
*** 130,135 ****
--- 131,143 ----
  
  	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
  
+ 	/* 
+ 	 * When the Startup process wants bgwriter to perform a restartpoint, it 
+ 	 * sets these fields so that we can update the control file afterwards.
+ 	 */
+ 	XLogRecPtr	ReadPtr;		/* ReadRecPtr for RestartPoint request */
+ 	CheckPoint *restartPoint;	/* restartPoint data for ControlFile */
+ 
  	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
  
  	int			num_requests;	/* current # of requests */
***************
*** 166,172 ****
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
--- 174,180 ----
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;	/* not used if IsRecoveryProcessingMode */
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
***************
*** 198,203 ****
--- 206,212 ----
  {
  	sigjmp_buf	local_sigjmp_buf;
  	MemoryContext bgwriter_context;
+ 	bool		BgWriterRecoveryMode;
  
  	BgWriterShmem->bgwriter_pid = MyProcPid;
  	am_bg_writer = true;
***************
*** 356,371 ****
  	 */
  	PG_SETMASK(&UnBlockSig);
  
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
- 		bool		do_checkpoint = false;
- 		int			flags = 0;
- 		pg_time_t	now;
- 		int			elapsed_secs;
- 
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
--- 365,381 ----
  	 */
  	PG_SETMASK(&UnBlockSig);
  
+ 	BgWriterRecoveryMode = IsRecoveryProcessingMode();
+ 
+ 	if (BgWriterRecoveryMode)
+ 		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+ 			BgWriterShmem->bgwriter_pid);
+ 
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
- 		if (shutdown_requested)
- 		{
- 			/*
- 			 * From here on, elog(ERROR) should end with exit(1), not send
- 			 * control back to the sigsetjmp block above
- 			 */
- 			ExitOnAnyError = true;
- 			/* Close down the database */
- 			ShutdownXLOG(0, 0);
- 			DumpFreeSpaceMap(0, 0);
- 			/* Normal exit from the bgwriter is here */
- 			proc_exit(0);		/* done */
- 		}
  
! 		/*
! 		 * Force a checkpoint if too much time has elapsed since the last one.
! 		 * Note that we count a timed checkpoint in stats only when this
! 		 * occurs without an external request, but we set the CAUSE_TIME flag
! 		 * bit even if there is also an external request.
! 		 */
! 		now = (pg_time_t) time(NULL);
! 		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
! 			if (!do_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
! 			do_checkpoint = true;
! 			flags |= CHECKPOINT_CAUSE_TIME;
  		}
! 
! 		/*
! 		 * Do a checkpoint if requested, otherwise do one cycle of
! 		 * dirty-buffer writing.
! 		 */
! 		if (do_checkpoint)
  		{
! 			/* use volatile pointer to prevent code rearrangement */
! 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
  
  			/*
! 			 * Atomically fetch the request flags to figure out what kind of a
! 			 * checkpoint we should perform, and increase the started-counter
! 			 * to acknowledge that we've started a new checkpoint.
  			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			flags |= bgs->ckpt_flags;
! 			bgs->ckpt_flags = 0;
! 			bgs->ckpt_started++;
! 			SpinLockRelease(&bgs->ckpt_lck);
  
  			/*
! 			 * We will warn if (a) too soon since last checkpoint (whatever
! 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 			 * since the last checkpoint start.  Note in particular that this
! 			 * implementation will not generate warnings caused by
! 			 * CheckPointTimeout < CheckPointWarning.
  			 */
! 			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 				elapsed_secs < CheckPointWarning)
! 				ereport(LOG,
! 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 								elapsed_secs),
! 						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  
! 			/*
! 			 * Initialize bgwriter-private variables used during checkpoint.
! 			 */
! 			ckpt_active = true;
! 			ckpt_start_recptr = GetInsertRecPtr();
! 			ckpt_start_time = now;
! 			ckpt_cached_elapsed = 0;
! 
! 			/*
! 			 * Do the checkpoint.
! 			 */
! 			CreateCheckPoint(flags);
  
! 			/*
! 			 * After any checkpoint, close all smgr files.	This is so we
! 			 * won't hang onto smgr references to deleted files indefinitely.
! 			 */
! 			smgrcloseall();
! 
! 			/*
! 			 * Indicate checkpoint completion to any waiting backends.
! 			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			bgs->ckpt_done = bgs->ckpt_started;
! 			SpinLockRelease(&bgs->ckpt_lck);
! 
! 			ckpt_active = false;
! 
! 			/*
! 			 * Note we record the checkpoint start time not end time as
! 			 * last_checkpoint_time.  This is so that time-driven checkpoints
! 			 * happen at a predictable spacing.
! 			 */
! 			last_checkpoint_time = now;
  		}
- 		else
- 			BgBufferSync();
- 
- 		/* Check for archive_timeout and switch xlog files if necessary. */
- 		CheckArchiveTimeout();
- 
- 		/* Nap for the configured time. */
- 		BgWriterNap();
  	}
  }
  
--- 393,599 ----
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
  
! 		if (BgWriterRecoveryMode)
  		{
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
! 
! 			if (!IsRecoveryProcessingMode())
! 			{
! 				elog(DEBUG2, "bgwriter changing from recovery to normal mode");
! 
! 				InitXLOGAccess();
! 				BgWriterRecoveryMode = false;
! 
! 				/*
! 				 * Start time-driven events from now
! 				 */
! 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! 
! 				/* 
! 				 * Notice that we do *not* act on a checkpoint_requested
! 				 * state at this point. We have changed mode, so we wish to
! 				 * perform a checkpoint not a restartpoint.
! 				 */
! 				continue;
! 			}
! 
! 			if (checkpoint_requested) 
! 			{
! 				XLogRecPtr		ReadPtr;
! 				CheckPoint		restartPoint;
! 
! 				checkpoint_requested = false;
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_time = (pg_time_t) time(NULL);
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Get the requested values from shared memory that the 
! 				 * Startup process has put there for us.
! 				 */
! 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! 				ReadPtr = BgWriterShmem->ReadPtr;
! 				memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
! 
! 				/* Use smoothed writes, until interrupted if ever */
! 				CreateRestartPoint(ReadPtr, &restartPoint, 0);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				ckpt_active = false;
! 				checkpoint_requested = false;
! 			}
! 			else
! 			{
! 				/* Clean buffers dirtied by recovery */
! 				BgBufferSync();
! 
! 				/* Nap for the configured time. */
! 				BgWriterNap();
! 			}
  		}
! 		else	/* Normal processing */
  		{
! 			bool		do_checkpoint = false;
! 			int			flags = 0;
! 			pg_time_t	now;
! 			int			elapsed_secs;
! 
! 			Assert(!IsRecoveryProcessingMode());
! 
! 			if (checkpoint_requested) 
! 			{
! 				checkpoint_requested = false;
! 				do_checkpoint = true;
! 				BgWriterStats.m_requested_checkpoints++;
! 			}
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Close down the database */
! 				ShutdownXLOG(0, 0);
! 				DumpFreeSpaceMap(0, 0);
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Force a checkpoint if too much time has elapsed since the last one.
! 			 * Note that we count a timed checkpoint in stats only when this
! 			 * occurs without an external request, but we set the CAUSE_TIME flag
! 			 * bit even if there is also an external request.
  			 */
! 			now = (pg_time_t) time(NULL);
! 			elapsed_secs = now - last_checkpoint_time;
! 			if (elapsed_secs >= CheckPointTimeout)
! 			{
! 				if (!do_checkpoint)
! 					BgWriterStats.m_timed_checkpoints++;
! 				do_checkpoint = true;
! 				flags |= CHECKPOINT_CAUSE_TIME;
! 			}
  
  			/*
! 			 * Do a checkpoint if requested, otherwise do one cycle of
! 			 * dirty-buffer writing.
  			 */
! 			if (do_checkpoint)
! 			{
! 				/* use volatile pointer to prevent code rearrangement */
! 				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 				/*
! 				 * Atomically fetch the request flags to figure out what kind of a
! 				 * checkpoint we should perform, and increase the started-counter
! 				 * to acknowledge that we've started a new checkpoint.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				flags |= bgs->ckpt_flags;
! 				bgs->ckpt_flags = 0;
! 				bgs->ckpt_started++;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				/*
! 				 * We will warn if (a) too soon since last checkpoint (whatever
! 				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 				 * since the last checkpoint start.  Note in particular that this
! 				 * implementation will not generate warnings caused by
! 				 * CheckPointTimeout < CheckPointWarning.
! 				 */
! 				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 					elapsed_secs < CheckPointWarning)
! 					ereport(LOG,
! 							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 									elapsed_secs),
! 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_recptr = GetInsertRecPtr();
! 				ckpt_start_time = now;
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Do the checkpoint.
! 				 */
! 				CreateCheckPoint(flags);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				/*
! 				 * Indicate checkpoint completion to any waiting backends.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				bgs->ckpt_done = bgs->ckpt_started;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * Note we record the checkpoint start time not end time as
! 				 * last_checkpoint_time.  This is so that time-driven checkpoints
! 				 * happen at a predictable spacing.
! 				 */
! 				last_checkpoint_time = now;
! 			}
! 			else
! 				BgBufferSync();
  
! 			/* Check for archive_timeout and switch xlog files if necessary. */
! 			CheckArchiveTimeout();
  
! 			/* Nap for the configured time. */
! 			BgWriterNap();
  		}
  	}
  }
  
***************
*** 588,594 ****
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
--- 686,693 ----
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		if (!IsRecoveryProcessingMode())
! 			AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
***************
*** 642,647 ****
--- 741,759 ----
  	if (!am_bg_writer)
  		return;
  
+ 	/* Perform minimal duties during recovery and skip wait if requested */
+ 	if (IsRecoveryProcessingMode())
+ 	{
+ 		BgBufferSync();
+ 
+ 		if (!shutdown_requested &&
+ 			!checkpoint_requested &&
+ 			IsCheckpointOnSchedule(progress))
+ 			BgWriterNap();
+ 
+ 		return;
+ 	}
+ 
  	/*
  	 * Perform the usual bgwriter duties and take a nap, unless we're behind
  	 * schedule, in which case we just try to catch up as quickly as possible.
***************
*** 716,731 ****
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	recptr = GetInsertRecPtr();
! 	elapsed_xlogs =
! 		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 		CheckPointSegments;
! 
! 	if (progress < elapsed_xlogs)
  	{
! 		ckpt_cached_elapsed = elapsed_xlogs;
! 		return false;
  	}
  
  	/*
--- 828,846 ----
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	if (!IsRecoveryProcessingMode())
  	{
! 		recptr = GetInsertRecPtr();
! 		elapsed_xlogs =
! 			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 			CheckPointSegments;
! 
! 		if (progress < elapsed_xlogs)
! 		{
! 			ckpt_cached_elapsed = elapsed_xlogs;
! 			return false;
! 		}
  	}
  
  	/*
***************
*** 967,972 ****
--- 1082,1130 ----
  }
  
  /*
+  * Always runs in Startup process (see xlog.c).  Performs the restartpoint
+  * directly, or hands ReadPtr/restartPoint to the bgwriter and signals it.
+  */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ 	/* Should we just do it ourselves? */
+ 	if (!IsPostmasterEnvironment || !sendToBGWriter)
+ 	{
+ 		CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE);
+ 		return;
+ 	}
+ 
+ 	/* Never kill() pid 0: that would signal our whole process group. */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 	{
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 		return;
+ 	}
+ 
+ 	/* Push requested values into shared memory, then signal the bgwriter. */
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = ReadPtr;
+ 	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");
+ }
+ 
+ /*
+  * Re-signal the bgwriter at end of recovery so that a restartpoint in
+  * progress stops smoothing writes and runs as if CHECKPOINT_IMMEDIATE.
+  */
+ void
+ RequestRestartPointCompletion(void)
+ {
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 		return;					/* no bgwriter to signal */
+ 	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint immediate: %m");
+ }
+ 
+ /*
   * ForwardFsyncRequest
   *		Forward a file-fsync request from a backend to the bgwriter
   *
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c	26 Jun 2008 02:47:19 -0000	1.561
--- src/backend/postmaster/postmaster.c	22 Sep 2008 20:25:36 -0000
***************
*** 254,259 ****
--- 254,264 ----
  {
  	PM_INIT,					/* postmaster starting */
  	PM_STARTUP,					/* waiting for startup subprocess */
+ 	PM_RECOVERY,				/* consistent recovery mode; state only
+ 								 * entered for archive and streaming recovery,
+ 								 * and only after the point where
+ 								 * all data is in a consistent state.
+ 								 */
  	PM_RUN,						/* normal "database is alive" state */
  	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
  	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
***************
*** 1294,1300 ****
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && pmState == PM_RUN)
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
--- 1299,1305 ----
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
***************
*** 2104,2110 ****
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
--- 2109,2115 ----
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
***************
*** 2145,2155 ****
  			load_role();
  
  			/*
! 			 * Crank up the background writer.	It doesn't matter if this
! 			 * fails, we'll just try again later.
  			 */
! 			Assert(BgWriterPID == 0);
! 			BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
--- 2150,2160 ----
  			load_role();
  
  			/*
! 			 * Check whether we need to start background writer, if not
! 			 * already running.
  			 */
! 			if (BgWriterPID == 0)
! 				BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
***************
*** 3821,3826 ****
--- 3826,3876 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database yet,
+ 			 * so it may change following recovery.
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer.	It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* XXX at this point we could accept read-only connections */
+ 			ereport(DEBUG1,
+ 				 (errmsg("database system is in consistent recovery mode")));
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.14
diff -c -r1.14 README
*** src/backend/storage/buffer/README	21 Mar 2008 13:23:28 -0000	1.14
--- src/backend/storage/buffer/README	22 Sep 2008 21:54:20 -0000
***************
*** 264,266 ****
--- 264,271 ----
  This ensures that the page image transferred to disk is reasonably consistent.
  We might miss a hint-bit update or two but that isn't a problem, for the same
  reasons mentioned under buffer access rules.
+ 
+ As of 8.4, the background writer starts during recovery mode when there is
+ some form of potentially extended recovery to perform. It performs the same
+ service as in normal processing, except that the checkpoints it writes are
+ technically restartpoints and it does not need to flush WAL for dirty buffers.
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h	12 May 2008 08:35:05 -0000	1.88
--- src/include/access/xlog.h	22 Sep 2008 19:04:56 -0000
***************
*** 133,139 ****
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! extern bool InRecovery;
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
--- 133,148 ----
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! 
! /* 
!  * Prior to 8.4, all activity during recovery was carried out by the Startup
!  * process. This local variable continues to be used in many parts of the
!  * code to indicate actions taken by RecoveryManagers. Other processes that
!  * potentially perform work during recovery should check
!  * IsRecoveryProcessingMode(); see XLogCtl notes in xlog.c.
!  */
! extern bool InRecovery;	
! 										
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
***************
*** 166,171 ****
--- 175,181 ----
  /* These indicate the cause of a checkpoint request */
  #define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
  #define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+ #define CHECKPOINT_RESTARTPOINT	0x0040	/* Restartpoint during recovery */
  
  /* Checkpoint statistics */
  typedef struct CheckpointStatsData
***************
*** 197,202 ****
--- 207,214 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
  
+ extern bool IsRecoveryProcessingMode(void);
+ 
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
Index: src/include/access/xlog_internal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog_internal.h,v
retrieving revision 1.24
diff -c -r1.24 xlog_internal.h
*** src/include/access/xlog_internal.h	11 Aug 2008 11:05:11 -0000	1.24
--- src/include/access/xlog_internal.h	23 Sep 2008 10:59:40 -0000
***************
*** 17,22 ****
--- 17,23 ----
  #define XLOG_INTERNAL_H
  
  #include "access/xlog.h"
+ #include "catalog/pg_control.h"
  #include "fmgr.h"
  #include "pgtime.h"
  #include "storage/block.h"
***************
*** 245,250 ****
--- 246,254 ----
  extern pg_time_t GetLastSegSwitchTime(void);
  extern XLogRecPtr RequestXLogSwitch(void);
  
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr, 
+ 				const CheckPoint *restartPoint, int flags);
+ 
  /*
   * These aren't in xlog.h because I'd rather not include fmgr.h there.
   */
Index: src/include/catalog/pg_control.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/catalog/pg_control.h,v
retrieving revision 1.41
diff -c -r1.41 pg_control.h
*** src/include/catalog/pg_control.h	21 Apr 2008 00:26:47 -0000	1.41
--- src/include/catalog/pg_control.h	22 Sep 2008 19:21:37 -0000
***************
*** 46,52 ****
  #define XLOG_NOOP						0x20
  #define XLOG_NEXTOID					0x30
  #define XLOG_SWITCH						0x40
! 
  
  /* System status indicator */
  typedef enum DBState
--- 46,52 ----
  #define XLOG_NOOP						0x20
  #define XLOG_NEXTOID					0x30
  #define XLOG_SWITCH						0x40
! #define XLOG_RECOVERY_END			0x50
  
  /* System status indicator */
  typedef enum DBState
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h	11 Aug 2008 11:05:11 -0000	1.12
--- src/include/postmaster/bgwriter.h	23 Sep 2008 11:12:18 -0000
***************
*** 12,17 ****
--- 12,18 ----
  #ifndef _BGWRITER_H
  #define _BGWRITER_H
  
+ #include "catalog/pg_control.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
  
***************
*** 25,30 ****
--- 26,33 ----
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
+ extern void RequestRestartPointCompletion(void);
  extern void CheckpointWriteDelay(int flags, double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h	19 Jun 2008 21:32:56 -0000	1.20
--- src/include/storage/pmsignal.h	22 Sep 2008 15:54:57 -0000
***************
*** 22,27 ****
--- 22,28 ----
   */
  typedef enum
  {
+ 	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
  	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
  	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
  	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Reply via email to