Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Simon Riggs Mon, 22 Sep 2008 15:06:51 -0700

On Thu, 2008-09-18 at 10:09 -0400, Tom Lane wrote:
> Simon Riggs <[EMAIL PROTECTED]> writes:
> > On Thu, 2008-09-18 at 09:06 -0400, Tom Lane wrote:
> >> Do we really need a checkpoint there at all?
> 
> > "Timelines only change at shutdown checkpoints".
> 
> Hmm.  I *think* that that is just a debugging crosscheck rather than a
> critical property.  But yeah, it would take some close investigation,
> which maybe isn't warranted if you have a less-invasive solution.


OK, new patch, version 6. Some major differences from the previous patch:

* new IsRecoveryProcessingMode() in shmem
* padding in XLogCtl to ensure above call is cheap
* specific part of bgwriter shmem for passing restartpoint data
* avoid Shutdown checkpoint at end of recovery, with carefully
considered positioning of statements (beware!)
* only one new postmaster mode, PM_RECOVERY
* bgwriter changes state without stopping/starting

Modes I have tested so far
* make check
* Start, Stop
* Crash Recovery
* Archive Recovery
* Archive Recovery, switch in middle of restartpoint

Modes not yet tested
* EXEC_BACKEND

Ready for serious review prior to commit. I will be performing further
testing also.

 backend/access/transam/multixact.c |    2 
 backend/access/transam/xlog.c      |  328 ++++++++++++---!!!!!!!!!!!!
 backend/postmaster/bgwriter.c      |  371 +++++---!!!!!!!!!!!!!!!!!!!!!
 backend/postmaster/postmaster.c    |   62 ++++!!
 backend/storage/buffer/README      |    5 
 backend/storage/buffer/bufmgr.c    |   34 +!!
 include/access/xlog.h              |   14 !
 include/access/xlog_internal.h     |    3 
 include/catalog/pg_control.h       |    2 
 include/postmaster/bgwriter.h      |    2 
 include/storage/bufmgr.h           |    2 
 include/storage/pmsignal.h         |    1 
 12 files changed, 279 insertions(+), 56 deletions(-), 491 mods(!)

There are a few subtle points along the way. I've tried to explain them
all in code comments, but questions welcome. At v6, most things are now
done a particular way for a specific reason.

Look especially at InRecovery, which is used extensively in different
parts of the code. Its meaning has been split in two: the process-local
InRecovery flag and the new shared-memory IsRecoveryProcessingMode()
state. So only *some* of the places that use it have been changed.
All have been checked.

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support

Index: src/backend/access/transam/multixact.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/multixact.c,v
retrieving revision 1.28
diff -c -r1.28 multixact.c
*** src/backend/access/transam/multixact.c	1 Aug 2008 13:16:08 -0000	1.28
--- src/backend/access/transam/multixact.c	22 Sep 2008 19:28:56 -0000
***************
*** 1543,1549 ****
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
  	 */
! 	if (!InRecovery)
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
--- 1543,1549 ----
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
  	 */
! 	if (!IsRecoveryProcessingMode())
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.317
diff -c -r1.317 xlog.c
*** src/backend/access/transam/xlog.c	11 Aug 2008 11:05:10 -0000	1.317
--- src/backend/access/transam/xlog.c	22 Sep 2008 21:30:24 -0000
***************
*** 119,124 ****
--- 119,125 ----
  
  /* Are we doing recovery from XLOG? */
  bool		InRecovery = false;
+ bool		reachedSafeStopPoint = false;
  
  /* Are we recovering using offline XLOG archives? */
  static bool InArchiveRecovery = false;
***************
*** 131,137 ****
  static bool recoveryTarget = false;
  static bool recoveryTargetExact = false;
  static bool recoveryTargetInclusive = true;
- static bool recoveryLogRestartpoints = false;
  static TransactionId recoveryTargetXid;
  static TimestampTz recoveryTargetTime;
  static TimestampTz recoveryLastXTime = 0;
--- 132,137 ----
***************
*** 286,295 ****
--- 286,297 ----
  /*
   * Total shared-memory state for XLOG.
   */
+ #define	XLOGCTL_BUFFER_SPACING	128
  typedef struct XLogCtlData
  {
  	/* Protected by WALInsertLock: */
  	XLogCtlInsert Insert;
+ 	char	InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
  
  	/* Protected by info_lck: */
  	XLogwrtRqst LogwrtRqst;
***************
*** 297,305 ****
--- 299,314 ----
  	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
  	TransactionId ckptXid;
  	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
+ 	/* add data structure padding for above info_lck declarations */
+ 	char	InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) 
+ 												- sizeof(XLogwrtResult)
+ 												- sizeof(uint32)
+ 												- sizeof(TransactionId)
+ 												- sizeof(XLogRecPtr)];
  
  	/* Protected by WALWriteLock: */
  	XLogCtlWrite Write;
+ 	char	WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
  
  	/*
  	 * These values do not change after startup, although the pointed-to pages
***************
*** 311,316 ****
--- 320,344 ----
  	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
  	TimeLineID	ThisTimeLineID;
  
+ 	/*
+ 	 * IsRecoveryProcessingMode shows whether the postmaster is in a
+ 	 * postmaster state earlier than PM_RUN, or not. This is a globally
+ 	 * accessible state to allow EXEC_BACKEND case.
+ 	 *
+ 	 * We also retain a local state variable InRecovery. InRecovery=true
+ 	 * means the code is being executed by Startup process and therefore
+ 	 * always during RecoveryProcessingMode. This allows us to retain the
+ 	 * often important distinction between code executed *during* 
+ 	 * RecoveryProcessingMode but not necessarily by the Startup process.
+ 	 *
+ 	 * Reviewer's note: all call points InRecovery and InRedo have been checked
+ 	 * for correctness and have been changed to IsRecoveryProcessingMode()
+ 	 * if appropriate.
+ 	 */
+ 	bool		IsRecoveryProcessingMode;
+ 
+ 	char		InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+ 
  	slock_t		info_lck;		/* locks shared variables shown above */
  } XLogCtlData;
  
***************
*** 396,401 ****
--- 424,430 ----
  static void readRecoveryCommandFile(void);
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
+ static XLogRecPtr exitRecovery(void);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
  static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  
***************
*** 479,484 ****
--- 508,518 ----
  	bool		updrqst;
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ 	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+ 
+ 	/* cross-check on whether we should be here or not */
+ 	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ 		elog(FATAL, "cannot make new WAL entries during recovery");
  
  	/* info's high bits are reserved for use by me */
  	if (info & XLR_INFO_MASK)
***************
*** 1677,1684 ****
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	/* Disabled during REDO */
! 	if (InRedo)
  		return;
  
  	/* Quick exit if already known flushed */
--- 1711,1717 ----
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	if (IsRecoveryProcessingMode())
  		return;
  
  	/* Quick exit if already known flushed */
***************
*** 1766,1774 ****
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while InRedo is true, but if the bad page is brought in
! 	 * and marked dirty during recovery then CreateCheckPoint will try to
! 	 * flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
--- 1799,1807 ----
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
! 	 * brought in and marked dirty during recovery then CreateCheckPoint will
! 	 * try to flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
***************
*** 2051,2057 ****
  		unlink(tmppath);
  	}
  
! 	elog(DEBUG2, "done creating and filling new WAL file");
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
--- 2084,2091 ----
  		unlink(tmppath);
  	}
  
! 	XLogFileName(tmppath, ThisTimeLineID, log, seg);
! 	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
***************
*** 4532,4546 ****
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
- 			/*
- 			 * does nothing if a recovery_target is not also set
- 			 */
- 			if (!parse_bool(tok2, &recoveryLogRestartpoints))
- 				  ereport(ERROR,
- 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
  			ereport(LOG,
! 					(errmsg("log_restartpoints = %s", tok2)));
  		}
  		else
  			ereport(FATAL,
--- 4566,4574 ----
  		}
  		else if (strcmp(tok1, "log_restartpoints") == 0)
  		{
  			ereport(LOG,
! 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					  errmsg("parameter \"log_restartpoints\" has been deprecated")));
  		}
  		else
  			ereport(FATAL,
***************
*** 4811,4828 ****
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
  				checkPointLoc,
  				minRecoveryLoc,
! 				EndOfLog;
  	uint32		endLogId;
  	uint32		endLogSeg;
  	XLogRecord *record;
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
--- 4839,4860 ----
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
+ 	bool		performedArchiveRecovery = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
  				checkPointLoc,
  				minRecoveryLoc,
! 				EndOfLog,
! 				RecoveryCompletionPtr;
  	uint32		endLogId;
  	uint32		endLogSeg;
  	XLogRecord *record;
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
+ 	XLogCtl->IsRecoveryProcessingMode = true;
+ 
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
***************
*** 5039,5044 ****
--- 5071,5081 ----
  		UpdateControlFile();
  
  		/*
+ 		 * Reset pgstat data, because it may be invalid after recovery.
+ 		 */
+ 		pgstat_reset_all();
+ 
+ 		/*
  		 * If there was a backup label file, it's done its job and the info
  		 * has now been propagated into pg_control.  We must get rid of the
  		 * label file so that if we crash during recovery, we'll pick up at
***************
*** 5148,5153 ****
--- 5185,5205 ----
  
  				LastRec = ReadRecPtr;
  
+ 				/*
+ 				 * Have we reached our safe stopping point? If so, we can
+ 				 * signal Postmaster to enter consistent recovery mode
+ 				 */
+ 				if (!reachedSafeStopPoint && 
+ 					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ 				{
+ 					reachedSafeStopPoint = true;
+ 					ereport(LOG,
+ 						(errmsg("consistent recovery state reached at %X/%X",
+ 							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ 					if (IsUnderPostmaster)
+ 						SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ 				}
+ 
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
***************
*** 5169,5174 ****
--- 5221,5227 ----
  			/* there are no WAL records following the checkpoint */
  			ereport(LOG,
  					(errmsg("redo is not required")));
+ 			reachedSafeStopPoint = true;
  		}
  	}
  
***************
*** 5184,5190 ****
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
--- 5237,5243 ----
  	 * Complain if we did not roll forward far enough to render the backup
  	 * dump consistent.
  	 */
! 	if (InRecovery && !reachedSafeStopPoint)
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
***************
*** 5227,5233 ****
--- 5280,5289 ----
  	 * we will use that below.)
  	 */
  	if (InArchiveRecovery)
+ 	{
+ 		performedArchiveRecovery = true;
  		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
+ 	}
  
  	/*
  	 * Prepare to write WAL starting at EndOfLog position, and init xlog
***************
*** 5286,5291 ****
--- 5342,5349 ----
  	/* Pre-scan prepared transactions to find out the range of XIDs present */
  	oldestActiveXID = PrescanPreparedTransactions();
  
+ 	RecoveryCompletionPtr = EndOfLog;
+ 
  	if (InRecovery)
  	{
  		int			rmid;
***************
*** 5306,5343 ****
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Reset pgstat data, because it may be invalid after recovery.
! 		 */
! 		pgstat_reset_all();
! 
! 		/*
! 		 * Perform a checkpoint to update all our recovery activity to disk.
! 		 *
! 		 * Note that we write a shutdown checkpoint rather than an on-line
! 		 * one. This is not particularly critical, but since we may be
! 		 * assigning a new TLI, using a shutdown checkpoint allows us to have
! 		 * the rule that TLI only changes in shutdown checkpoints, which
! 		 * allows some extra error checking in xlog_redo.
  		 */
! 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
  	}
  
- 	/*
- 	 * Preallocate additional log files, if wanted.
- 	 */
- 	PreallocXlogFiles(EndOfLog);
- 
- 	/*
- 	 * Okay, we're officially UP.
- 	 */
- 	InRecovery = false;
- 
- 	ControlFile->state = DB_IN_PRODUCTION;
- 	ControlFile->time = (pg_time_t) time(NULL);
- 	UpdateControlFile();
- 
  	/* start the archive_timeout timer running */
! 	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
  
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
--- 5364,5377 ----
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
! 		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
  		 */
! 		RecoveryCompletionPtr = exitRecovery();
  	}
  
  	/* start the archive_timeout timer running */
! 	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
  
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
***************
*** 5372,5377 ****
--- 5406,5446 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 
+ 	/*
+ 	 * Okay, we're officially UP.
+ 	 */
+ 	XLogCtl->IsRecoveryProcessingMode = false;
+ 
+ 	/*
+ 	 * If we had to perform archive recovery we don't mark the control file,
+ 	 * yet, since we haven't definitely got a safe point to recover from that
+ 	 * doesn't rely on archived WAL files. So we switch quickly into normal 
+ 	 * processing and rely on the bgwriter's checkpoint (NOT a restartpoint)
+ 	 * to define a safe recovery point and put us into full production state. 
+ 	 * We specifically do not want to wait for checkpoint completion here,
+ 	 * so we can reduce startup time in a standby mode replication failover.
+ 	 * The checkpoint creation will also flush WAL, so we wait for that
+ 	 * otherwise we may need to prepare WAL files ourselves.
+ 	 *
+ 	 * If we are doing crash recovery, we know we have WAL files accessible
+ 	 * so we just get started again as quickly as possible.
+ 	 */
+ 	if (performedArchiveRecovery)
+ 		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ 	else
+ 	{
+ 		XLogFlush(RecoveryCompletionPtr);
+ 		ControlFile->state = DB_IN_PRODUCTION;
+ 		ControlFile->time = (pg_time_t) time(NULL);
+ 		UpdateControlFile();
+ 	}
+ }
+ 
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ 	return XLogCtl->IsRecoveryProcessingMode;
  }
  
  /*
***************
*** 5642,5648 ****
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(void)
  {
  	long		write_secs,
  				sync_secs,
--- 5711,5717 ----
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(int flags)
  {
  	long		write_secs,
  				sync_secs,
***************
*** 5665,5681 ****
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 		 "%d transaction log file(s) added, %d removed, %d recycled; "
! 		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 		 CheckpointStats.ckpt_bufs_written,
! 		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 		 CheckpointStats.ckpt_segs_added,
! 		 CheckpointStats.ckpt_segs_removed,
! 		 CheckpointStats.ckpt_segs_recycled,
! 		 write_secs, write_usecs / 1000,
! 		 sync_secs, sync_usecs / 1000,
! 		 total_secs, total_usecs / 1000);
  }
  
  /*
--- 5734,5759 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	if (flags & CHECKPOINT_RESTARTPOINT)
! 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
! 	else
! 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 CheckpointStats.ckpt_segs_added,
! 			 CheckpointStats.ckpt_segs_removed,
! 			 CheckpointStats.ckpt_segs_recycled,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  }
  
  /*
***************
*** 5944,5949 ****
--- 6022,6029 ----
  	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	if (shutdown)
  		ControlFile->state = DB_SHUTDOWNED;
+ 	else
+ 		ControlFile->state = DB_IN_PRODUCTION;
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ProcLastRecPtr;
  	ControlFile->checkPointCopy = checkPoint;
***************
*** 6002,6008 ****
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd();
  
  	LWLockRelease(CheckpointLock);
  }
--- 6082,6088 ----
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd(flags);
  
  	LWLockRelease(CheckpointLock);
  }
***************
*** 6071,6099 ****
  			}
  	}
  
  	/*
! 	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
  
  	/*
! 	 * Update pg_control so that any subsequent crash will restart from this
! 	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
! 	 * record itself.
  	 */
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadRecPtr;
! 	ControlFile->checkPointCopy = *checkPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
  
! 	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 	if (recoveryLastXTime)
! 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
  }
  
  /*
--- 6151,6215 ----
  			}
  	}
  
+ 	if (recoveryLastXTime)
+ 		ereport((log_checkpoints ? LOG : DEBUG2),
+ 				(errmsg("last completed transaction was at log time %s",
+ 						timestamptz_to_str(recoveryLastXTime))));
+ 
+ 	RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStopPoint);
+ }
+ 
+ /*
+  * As of 8.4, RestartPoints are always created by the bgwriter
+  * once we have reachedSafeStopPoint. We use bgwriter's shared memory
+  * area wherever we call it from, to keep better code structure.
+  */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint)
+ {
+ 	if (log_checkpoints)
+ 	{
+ 		/*
+ 		 * Prepare to accumulate statistics.
+ 		 */
+ 
+ 		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ 		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ 
+ 		/*
+ 		 * Do the restartpoint equivalent of LogCheckpointStart()
+ 		 */
+ 		elog(LOG, "restartpoint starting");
+ 	}
+ 
+ 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+ 
  	/*
! 	 * OK, write out dirty blocks smoothly
  	 */
! 	CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT);
  
  	/*
! 	 * Update pg_control, using current time
  	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadPtr;
! 	ControlFile->checkPointCopy = *restartPoint;
  	ControlFile->time = (pg_time_t) time(NULL);
  	UpdateControlFile();
+ 	LWLockRelease(ControlFileLock);
+ 
+ 	/* All real work is done, but log before releasing lock. */
+ 	if (log_checkpoints)
+ 		LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
  
! 	ereport((log_checkpoints ? LOG : DEBUG2),
  			(errmsg("recovery restart point at %X/%X",
! 					restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
! 
! 	LWLockRelease(CheckpointLock);
! 
  }
  
  /*
***************
*** 6158,6164 ****
  }
  
  /*
!  * XLOG resource manager's routines
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 6274,6313 ----
  }
  
  /*
!  * exitRecovery()
!  *
!  * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
!  * only record type that can record a change of timelineID. We assume
!  * caller has already set ThisTimeLineID, if appropriate.
!  */
! static XLogRecPtr
! exitRecovery(void)
! {
! 	XLogRecPtr	RecPtr;
! 	XLogRecData rdata;
! 
! 	rdata.buffer = InvalidBuffer;
! 	rdata.data = (char *) (&ThisTimeLineID);
! 	rdata.len = sizeof(TimeLineID);
! 	rdata.next = NULL;
! 
! 	/*
! 	 * This is the only type of WAL message that can be inserted during
! 	 * recovery. This ensures that we don't allow others to get access
! 	 * until after we have changed state.
! 	 */
! 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
! 
! 	InRecovery = false;
! 
! 	return RecPtr;
! }
! 
! /*
!  * XLOG resource manager's routines.
!  *
!  * Definitions of message info are in include/catalog/pg_control.h,
!  * though not all messages relate to control file processing.
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6193,6213 ****
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
  		/*
! 		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
  		 */
! 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
  		{
! 			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
  				!list_member_int(expectedTLIs,
! 								 (int) checkPoint.ThisTimeLineID))
  				ereport(PANIC,
! 						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
! 								checkPoint.ThisTimeLineID, ThisTimeLineID)));
  			/* Following WAL records should be run with new TLI */
! 			ThisTimeLineID = checkPoint.ThisTimeLineID;
  		}
- 
- 		RecoveryRestartPoint(&checkPoint);
  	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
--- 6342,6379 ----
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
  		/*
! 		 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
! 		 * shutdown checkpoints only occur at shutdown. Much less confusing.
  		 */
! 
! 		RecoveryRestartPoint(&checkPoint);
! 	}
! 	else if (info == XLOG_RECOVERY_END)
! 	{
! 		TimeLineID	tli;
! 
! 		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
! 
! 		/*
! 		 * TLI may change when recovery ends, but it shouldn't decrease.
! 		 *
! 		 * This is the only WAL record that can tell us to change timelineID
! 		 * while we process WAL records. 
! 		 *
! 		 * We can *choose* to stop recovery at any point, generating a
! 		 * new timelineID which is recorded using this record type.
! 		 */
! 		if (tli != ThisTimeLineID)
  		{
! 			if (tli < ThisTimeLineID ||
  				!list_member_int(expectedTLIs,
! 								 (int) tli))
  				ereport(PANIC,
! 						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
! 								tli, ThisTimeLineID)));
  			/* Following WAL records should be run with new TLI */
! 			ThisTimeLineID = tli;
  		}
  	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
Index: src/backend/postmaster/bgwriter.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/bgwriter.c,v
retrieving revision 1.51
diff -c -r1.51 bgwriter.c
*** src/backend/postmaster/bgwriter.c	11 Aug 2008 11:05:11 -0000	1.51
--- src/backend/postmaster/bgwriter.c	22 Sep 2008 20:58:59 -0000
***************
*** 49,54 ****
--- 49,55 ----
  #include <unistd.h>
  
  #include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
***************
*** 130,135 ****
--- 131,143 ----
  
  	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
  
+ 	/* 
+ 	 * When the Startup process wants bgwriter to perform a restartpoint, it 
+ 	 * sets these fields so that we can update the control file afterwards.
+ 	 */
+ 	XLogRecPtr	ReadPtr;		/* ReadRecPtr for RestartPoint request */
+ 	CheckPoint *restartPoint;	/* restartPoint data for ControlFile */
+ 
  	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
  
  	int			num_requests;	/* current # of requests */
***************
*** 166,172 ****
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
--- 174,180 ----
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;	/* not used if IsRecoveryProcessingMode */
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
***************
*** 198,203 ****
--- 206,212 ----
  {
  	sigjmp_buf	local_sigjmp_buf;
  	MemoryContext bgwriter_context;
+ 	bool		BgWriterRecoveryMode;
  
  	BgWriterShmem->bgwriter_pid = MyProcPid;
  	am_bg_writer = true;
***************
*** 356,371 ****
  	 */
  	PG_SETMASK(&UnBlockSig);
  
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
- 		bool		do_checkpoint = false;
- 		int			flags = 0;
- 		pg_time_t	now;
- 		int			elapsed_secs;
- 
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
--- 365,381 ----
  	 */
  	PG_SETMASK(&UnBlockSig);
  
+ 	BgWriterRecoveryMode = IsRecoveryProcessingMode();
+ 
+ 	if (BgWriterRecoveryMode)
+ 		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+ 			BgWriterShmem->bgwriter_pid);
+ 
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
***************
*** 383,501 ****
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
- 		if (shutdown_requested)
- 		{
- 			/*
- 			 * From here on, elog(ERROR) should end with exit(1), not send
- 			 * control back to the sigsetjmp block above
- 			 */
- 			ExitOnAnyError = true;
- 			/* Close down the database */
- 			ShutdownXLOG(0, 0);
- 			DumpFreeSpaceMap(0, 0);
- 			/* Normal exit from the bgwriter is here */
- 			proc_exit(0);		/* done */
- 		}
  
! 		/*
! 		 * Force a checkpoint if too much time has elapsed since the last one.
! 		 * Note that we count a timed checkpoint in stats only when this
! 		 * occurs without an external request, but we set the CAUSE_TIME flag
! 		 * bit even if there is also an external request.
! 		 */
! 		now = (pg_time_t) time(NULL);
! 		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
! 			if (!do_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
! 			do_checkpoint = true;
! 			flags |= CHECKPOINT_CAUSE_TIME;
  		}
! 
! 		/*
! 		 * Do a checkpoint if requested, otherwise do one cycle of
! 		 * dirty-buffer writing.
! 		 */
! 		if (do_checkpoint)
  		{
! 			/* use volatile pointer to prevent code rearrangement */
! 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
  
  			/*
! 			 * Atomically fetch the request flags to figure out what kind of a
! 			 * checkpoint we should perform, and increase the started-counter
! 			 * to acknowledge that we've started a new checkpoint.
  			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			flags |= bgs->ckpt_flags;
! 			bgs->ckpt_flags = 0;
! 			bgs->ckpt_started++;
! 			SpinLockRelease(&bgs->ckpt_lck);
! 
! 			/*
! 			 * We will warn if (a) too soon since last checkpoint (whatever
! 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 			 * since the last checkpoint start.  Note in particular that this
! 			 * implementation will not generate warnings caused by
! 			 * CheckPointTimeout < CheckPointWarning.
! 			 */
! 			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 				elapsed_secs < CheckPointWarning)
! 				ereport(LOG,
! 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 								elapsed_secs),
! 						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 			/*
! 			 * Initialize bgwriter-private variables used during checkpoint.
! 			 */
! 			ckpt_active = true;
! 			ckpt_start_recptr = GetInsertRecPtr();
! 			ckpt_start_time = now;
! 			ckpt_cached_elapsed = 0;
! 
! 			/*
! 			 * Do the checkpoint.
! 			 */
! 			CreateCheckPoint(flags);
  
  			/*
! 			 * After any checkpoint, close all smgr files.	This is so we
! 			 * won't hang onto smgr references to deleted files indefinitely.
  			 */
! 			smgrcloseall();
  
! 			/*
! 			 * Indicate checkpoint completion to any waiting backends.
! 			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			bgs->ckpt_done = bgs->ckpt_started;
! 			SpinLockRelease(&bgs->ckpt_lck);
! 
! 			ckpt_active = false;
  
! 			/*
! 			 * Note we record the checkpoint start time not end time as
! 			 * last_checkpoint_time.  This is so that time-driven checkpoints
! 			 * happen at a predictable spacing.
! 			 */
! 			last_checkpoint_time = now;
  		}
- 		else
- 			BgBufferSync();
- 
- 		/* Check for archive_timeout and switch xlog files if necessary. */
- 		CheckArchiveTimeout();
- 
- 		/* Nap for the configured time. */
- 		BgWriterNap();
  	}
  }
  
--- 393,592 ----
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
  
! 		if (BgWriterRecoveryMode)
  		{
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
! 
! 			if (!IsRecoveryProcessingMode())
! 			{
! 				elog(DEBUG2, "bgwriter changing from recovery to normal mode");
! 
! 				InitXLOGAccess();
! 				BgWriterRecoveryMode = false;
! 
! 				/*
! 				 * Start time-driven events from now
! 				 */
! 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! 
! 				continue;
! 			}
! 
! 			if (checkpoint_requested) 
! 			{
! 				XLogRecPtr		ReadPtr;
! 				CheckPoint		restartPoint;
! 
! 				checkpoint_requested = false;
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_time = (pg_time_t) time(NULL);
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Get the requested values from shared memory that the 
! 				 * Startup process has put there for us.
! 				 */
! 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! 				ReadPtr = BgWriterShmem->ReadPtr;
! 				memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
! 
! 				CreateRestartPoint(ReadPtr, &restartPoint);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 			}
! 			else
! 			{
! 				/* Clean buffers dirtied by recovery */
! 				BgBufferSync(true);
! 
! 				/* Nap for the configured time. */
! 				BgWriterNap();
! 			}
  		}
! 		else	/* Normal processing */
  		{
! 			bool		do_checkpoint = false;
! 			int			flags = 0;
! 			pg_time_t	now;
! 			int			elapsed_secs;
! 
! 			Assert(!IsRecoveryProcessingMode());
! 
! 			if (checkpoint_requested) 
! 			{
! 				checkpoint_requested = false;
! 				do_checkpoint = true;
! 				BgWriterStats.m_requested_checkpoints++;
! 			}
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Close down the database */
! 				ShutdownXLOG(0, 0);
! 				DumpFreeSpaceMap(0, 0);
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Force a checkpoint if too much time has elapsed since the last one.
! 			 * Note that we count a timed checkpoint in stats only when this
! 			 * occurs without an external request, but we set the CAUSE_TIME flag
! 			 * bit even if there is also an external request.
  			 */
! 			now = (pg_time_t) time(NULL);
! 			elapsed_secs = now - last_checkpoint_time;
! 			if (elapsed_secs >= CheckPointTimeout)
! 			{
! 				if (!do_checkpoint)
! 					BgWriterStats.m_timed_checkpoints++;
! 				do_checkpoint = true;
! 				flags |= CHECKPOINT_CAUSE_TIME;
! 			}
  
  			/*
! 			 * Do a checkpoint if requested, otherwise do one cycle of
! 			 * dirty-buffer writing.
  			 */
! 			if (do_checkpoint)
! 			{
! 				/* use volatile pointer to prevent code rearrangement */
! 				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 				/*
! 				 * Atomically fetch the request flags to figure out what kind of a
! 				 * checkpoint we should perform, and increase the started-counter
! 				 * to acknowledge that we've started a new checkpoint.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				flags |= bgs->ckpt_flags;
! 				bgs->ckpt_flags = 0;
! 				bgs->ckpt_started++;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				/*
! 				 * We will warn if (a) too soon since last checkpoint (whatever
! 				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 				 * since the last checkpoint start.  Note in particular that this
! 				 * implementation will not generate warnings caused by
! 				 * CheckPointTimeout < CheckPointWarning.
! 				 */
! 				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 					elapsed_secs < CheckPointWarning)
! 					ereport(LOG,
! 							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 									elapsed_secs),
! 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_recptr = GetInsertRecPtr();
! 				ckpt_start_time = now;
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Do the checkpoint.
! 				 */
! 				CreateCheckPoint(flags);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				/*
! 				 * Indicate checkpoint completion to any waiting backends.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				bgs->ckpt_done = bgs->ckpt_started;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * Note we record the checkpoint start time not end time as
! 				 * last_checkpoint_time.  This is so that time-driven checkpoints
! 				 * happen at a predictable spacing.
! 				 */
! 				last_checkpoint_time = now;
! 			}
! 			else
! 				BgBufferSync(false);
  
! 			/* Check for archive_timeout and switch xlog files if necessary. */
! 			CheckArchiveTimeout();
  
! 			/* Nap for the configured time. */
! 			BgWriterNap();
  		}
  	}
  }
  
***************
*** 588,594 ****
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
--- 679,686 ----
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		if (!IsRecoveryProcessingMode())
! 			AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
***************
*** 642,647 ****
--- 734,751 ----
  	if (!am_bg_writer)
  		return;
  
+ 	/* Perform minimal duties during recovery and skip wait if requested */
+ 	if (IsRecoveryProcessingMode())
+ 	{
+ 		BgBufferSync(true);
+ 
+ 		if (!shutdown_requested &&
+ 			IsCheckpointOnSchedule(progress))
+ 			BgWriterNap();
+ 
+ 		return;
+ 	}
+ 
  	/*
  	 * Perform the usual bgwriter duties and take a nap, unless we're behind
  	 * schedule, in which case we just try to catch up as quickly as possible.
***************
*** 660,666 ****
  		AbsorbFsyncRequests();
  		absorb_counter = WRITES_PER_ABSORB;
  
! 		BgBufferSync();
  		CheckArchiveTimeout();
  		BgWriterNap();
  	}
--- 764,770 ----
  		AbsorbFsyncRequests();
  		absorb_counter = WRITES_PER_ABSORB;
  
! 		BgBufferSync(false);
  		CheckArchiveTimeout();
  		BgWriterNap();
  	}
***************
*** 716,731 ****
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	recptr = GetInsertRecPtr();
! 	elapsed_xlogs =
! 		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 		CheckPointSegments;
! 
! 	if (progress < elapsed_xlogs)
  	{
! 		ckpt_cached_elapsed = elapsed_xlogs;
! 		return false;
  	}
  
  	/*
--- 820,838 ----
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	if (!IsRecoveryProcessingMode())
  	{
! 		recptr = GetInsertRecPtr();
! 		elapsed_xlogs =
! 			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 			CheckPointSegments;
! 
! 		if (progress < elapsed_xlogs)
! 		{
! 			ckpt_cached_elapsed = elapsed_xlogs;
! 			return false;
! 		}
  	}
  
  	/*
***************
*** 967,972 ****
--- 1074,1109 ----
  }
  
  /*
+  * Always runs in Startup process (see xlog.c)
+  */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ 	/* Should we just do it ourselves? */
+ 	if (!IsPostmasterEnvironment || !sendToBGWriter)
+ 	{
+ 		CreateRestartPoint(ReadPtr, restartPoint);
+ 		return;
+ 	}
+ 
+ 	/* Push the requested values into shared memory for the bgwriter. */
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = ReadPtr;
+ 	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 
+ 	/*
+ 	 * Signal the bgwriter to perform the restartpoint.  A zero pid must
+ 	 * never reach kill(): kill(0, ...) would signal every process in our
+ 	 * process group rather than the bgwriter.
+ 	 */
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 	else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");
+ }
+ 
+ /*
   * ForwardFsyncRequest
   *		Forward a file-fsync request from a backend to the bgwriter
   *
Index: src/backend/postmaster/postmaster.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/postmaster/postmaster.c,v
retrieving revision 1.561
diff -c -r1.561 postmaster.c
*** src/backend/postmaster/postmaster.c	26 Jun 2008 02:47:19 -0000	1.561
--- src/backend/postmaster/postmaster.c	22 Sep 2008 20:25:36 -0000
***************
*** 254,259 ****
--- 254,264 ----
  {
  	PM_INIT,					/* postmaster starting */
  	PM_STARTUP,					/* waiting for startup subprocess */
+ 	PM_RECOVERY,				/* consistent recovery mode; state only
+ 								 * entered for archive and streaming recovery,
+ 								 * and only after the point where
+ 								 * all data is in a consistent state.
+ 								 */
  	PM_RUN,						/* normal "database is alive" state */
  	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
  	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
***************
*** 1294,1300 ****
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && pmState == PM_RUN)
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
--- 1299,1305 ----
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
***************
*** 2104,2110 ****
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
--- 2109,2115 ----
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
***************
*** 2145,2155 ****
  			load_role();
  
  			/*
! 			 * Crank up the background writer.	It doesn't matter if this
! 			 * fails, we'll just try again later.
  			 */
! 			Assert(BgWriterPID == 0);
! 			BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
--- 2150,2160 ----
  			load_role();
  
  			/*
! 			 * Check whether we need to start background writer, if not
! 			 * already running.
  			 */
! 			if (BgWriterPID == 0)
! 				BgWriterPID = StartBackgroundWriter();
  
  			/*
  			 * Likewise, start other special children as needed.  In a restart
***************
*** 3821,3826 ****
--- 3826,3876 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database yet,
+ 			 * so it may change following recovery.
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer.	It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* XXX at this point we could accept read-only connections */
+ 			ereport(DEBUG1,
+ 				 (errmsg("database system is in consistent recovery mode")));
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.14
diff -c -r1.14 README
*** src/backend/storage/buffer/README	21 Mar 2008 13:23:28 -0000	1.14
--- src/backend/storage/buffer/README	22 Sep 2008 21:54:20 -0000
***************
*** 264,266 ****
--- 264,271 ----
  This ensures that the page image transferred to disk is reasonably consistent.
  We might miss a hint-bit update or two but that isn't a problem, for the same
  reasons mentioned under buffer access rules.
+ 
+ As of 8.4, the background writer starts during recovery mode when there is
+ some form of potentially extended recovery to perform. It performs the same
+ service as in normal processing, except that the checkpoints it writes are
+ technically restartpoints and it need not flush WAL for dirty buffers.
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.237
diff -c -r1.237 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	11 Aug 2008 11:05:11 -0000	1.237
--- src/backend/storage/buffer/bufmgr.c	22 Sep 2008 21:01:02 -0000
***************
*** 1080,1086 ****
   *
   * This is called at checkpoint time to write out all dirty shared buffers.
   * The checkpoint request flags should be passed in; currently the only one
!  * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
   */
  static void
  BufferSync(int flags)
--- 1080,1087 ----
   *
   * This is called at checkpoint time to write out all dirty shared buffers.
   * The checkpoint request flags should be passed in; currently the only one
!  * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes,
!  * except for restartpoints, which use CHECKPOINT_RESTARTPOINT. 
   */
  static void
  BufferSync(int flags)
***************
*** 1089,1094 ****
--- 1090,1096 ----
  	int			num_to_scan;
  	int			num_to_write;
  	int			num_written;
+ 	bool		cleanup_and_exit = false;
  
  	/* Make sure we can handle the pin inside SyncOneBuffer */
  	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
***************
*** 1163,1169 ****
  		 */
  		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
! 			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
  				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
  				BgWriterStats.m_buf_written_checkpoints++;
--- 1165,1180 ----
  		 */
  		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
! 			if (cleanup_and_exit)
! 			{
! 				LockBufHdr(bufHdr);
! 
! 				if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
! 					bufHdr->flags &= ~BM_CHECKPOINT_NEEDED;
! 
! 				UnlockBufHdr(bufHdr);
! 			}
! 			else if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
  			{
  				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
  				BgWriterStats.m_buf_written_checkpoints++;
***************
*** 1194,1199 ****
--- 1205,1218 ----
  
  		if (++buf_id >= NBuffers)
  			buf_id = 0;
+ 
+ 		/*
+ 		 * Quit scanning if state changes while we're here. If it
+ 		 * does we want to stop performing the current restartpoint
+ 		 * as quickly as possible. We'll write a real checkpoint instead.
+ 		 */
+ 		if (flags & CHECKPOINT_RESTARTPOINT && !IsRecoveryProcessingMode())
+ 			cleanup_and_exit = true;
  	}
  
  	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
***************
*** 1211,1217 ****
   * This is called periodically by the background writer process.
   */
  void
! BgBufferSync(void)
  {
  	/* info obtained from freelist.c */
  	int			strategy_buf_id;
--- 1230,1236 ----
   * This is called periodically by the background writer process.
   */
  void
! BgBufferSync(bool calledInRecovery)
  {
  	/* info obtained from freelist.c */
  	int			strategy_buf_id;
***************
*** 1423,1428 ****
--- 1442,1456 ----
  	{
  		int			buffer_state = SyncOneBuffer(next_to_clean, true);
  
+ 		/*
+ 		 * Quit scanning if state changes while we're here, otherwise
+ 		 * we might skip writing WAL for a buffer that was modified
+ 		 * by a transaction that committed very soon after recovery
+ 		 * mode completes.
+ 		 */
+ 		if (calledInRecovery && !IsRecoveryProcessingMode())
+ 			break;
+ 
  		if (++next_to_clean >= NBuffers)
  		{
  			next_to_clean = 0;
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog.h,v
retrieving revision 1.88
diff -c -r1.88 xlog.h
*** src/include/access/xlog.h	12 May 2008 08:35:05 -0000	1.88
--- src/include/access/xlog.h	22 Sep 2008 19:04:56 -0000
***************
*** 133,139 ****
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! extern bool InRecovery;
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
--- 133,148 ----
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! 
! /* 
!  * Prior to 8.4, all activity during recovery was carried out by the Startup
!  * process. This local variable continues to be used in many parts of the
!  * code to indicate actions taken by RecoveryManagers. Other processes that
!  * potentially perform work during recovery should check
!  * IsRecoveryProcessingMode(); see the XLogCtl notes in xlog.c.
!  */
! extern bool InRecovery;	
! 										
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
***************
*** 166,171 ****
--- 175,181 ----
  /* These indicate the cause of a checkpoint request */
  #define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
  #define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+ #define CHECKPOINT_RESTARTPOINT	0x0040	/* Restartpoint during recovery */
  
  /* Checkpoint statistics */
  typedef struct CheckpointStatsData
***************
*** 197,202 ****
--- 207,214 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
  
+ extern bool IsRecoveryProcessingMode(void);
+ 
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
Index: src/include/access/xlog_internal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/access/xlog_internal.h,v
retrieving revision 1.24
diff -c -r1.24 xlog_internal.h
*** src/include/access/xlog_internal.h	11 Aug 2008 11:05:11 -0000	1.24
--- src/include/access/xlog_internal.h	22 Sep 2008 20:26:56 -0000
***************
*** 17,22 ****
--- 17,23 ----
  #define XLOG_INTERNAL_H
  
  #include "access/xlog.h"
+ #include "catalog/pg_control.h"
  #include "fmgr.h"
  #include "pgtime.h"
  #include "storage/block.h"
***************
*** 245,250 ****
--- 246,253 ----
  extern pg_time_t GetLastSegSwitchTime(void);
  extern XLogRecPtr RequestXLogSwitch(void);
  
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint);
+ 
  /*
   * These aren't in xlog.h because I'd rather not include fmgr.h there.
   */
Index: src/include/catalog/pg_control.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/catalog/pg_control.h,v
retrieving revision 1.41
diff -c -r1.41 pg_control.h
*** src/include/catalog/pg_control.h	21 Apr 2008 00:26:47 -0000	1.41
--- src/include/catalog/pg_control.h	22 Sep 2008 19:21:37 -0000
***************
*** 46,52 ****
  #define XLOG_NOOP						0x20
  #define XLOG_NEXTOID					0x30
  #define XLOG_SWITCH						0x40
! 
  
  /* System status indicator */
  typedef enum DBState
--- 46,52 ----
  #define XLOG_NOOP						0x20
  #define XLOG_NEXTOID					0x30
  #define XLOG_SWITCH						0x40
! #define XLOG_RECOVERY_END			0x50
  
  /* System status indicator */
  typedef enum DBState
Index: src/include/postmaster/bgwriter.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/postmaster/bgwriter.h,v
retrieving revision 1.12
diff -c -r1.12 bgwriter.h
*** src/include/postmaster/bgwriter.h	11 Aug 2008 11:05:11 -0000	1.12
--- src/include/postmaster/bgwriter.h	22 Sep 2008 15:53:22 -0000
***************
*** 12,17 ****
--- 12,18 ----
  #ifndef _BGWRITER_H
  #define _BGWRITER_H
  
+ #include "catalog/pg_control.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
  
***************
*** 25,30 ****
--- 26,32 ----
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
  extern void CheckpointWriteDelay(int flags, double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.115
diff -c -r1.115 bufmgr.h
*** src/include/storage/bufmgr.h	11 Aug 2008 11:05:11 -0000	1.115
--- src/include/storage/bufmgr.h	22 Sep 2008 18:36:54 -0000
***************
*** 193,199 ****
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BgBufferSync(void);
  
  extern void AtProcExit_LocalBuffers(void);
  
--- 193,199 ----
  extern void AbortBufferIO(void);
  
  extern void BufmgrCommit(void);
! extern void BgBufferSync(bool calledInRecovery);
  
  extern void AtProcExit_LocalBuffers(void);
  
Index: src/include/storage/pmsignal.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/pmsignal.h,v
retrieving revision 1.20
diff -c -r1.20 pmsignal.h
*** src/include/storage/pmsignal.h	19 Jun 2008 21:32:56 -0000	1.20
--- src/include/storage/pmsignal.h	22 Sep 2008 15:54:57 -0000
***************
*** 22,27 ****
--- 22,28 ----
   */
  typedef enum
  {
+ 	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
  	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
  	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
  	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [PATCHES] [HACKERS] Infrastructure changes for recovery

Reply via email to