Hi, Here is a summary of my work during the last few days on this demote approach.
Please find attached v2-0001-Demote-PoC.patch; comments are in the commit message and as FIXMEs in the code. The patch is not finished or bug-free yet, I'm still not very happy with the coding style, and it probably lacks some code documentation, but a lot has changed since v1. It's still a PoC to push the discussion a bit further after having been silent myself for some days. The patch currently relies on a demote checkpoint. I understand that the overhead of a forced checkpoint can be massive and cause major wait/downtime, but I'm keeping this for a later step. Maybe we should be able to cancel a running checkpoint? Or leave it to its syncing work but discard the result without writing it to XLog? I haven't had time to investigate Robert's concern about shared memory for snapshots during recovery. The patch doesn't deal with prepared xacts yet. Testing "start->demote->promote" raises an assert if some prepared xacts exist. I suppose I will roll them back during demote in the next patch version. I'm not sure how to divide this patch into multiple small independent steps. I suppose I can split it like: 1. add demote checkpoint 2. support demote: mostly postmaster, startup/xlog and checkpointer related code 3. cli using pg_ctl demote ...But I'm not sure it's worth it. Regards,
>From 03c41dd706648cd20df90a128db64eee6b6dad97 Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais <j...@dalibo.com> Date: Fri, 10 Apr 2020 18:01:45 +0200 Subject: [PATCH] Demote PoC Changes: * create a demote checkpoint * use DB_DEMOTING state in controlfile * try to handle subsystems init correctly during demote * keep some sub-processes alive: stat collector, checkpointer, bgwriter and optionally archiver or wal senders * add signal PMSIGNAL_DEMOTING to start the startup process after the demote checkpoint * ShutdownXLOG takes a boolean arg to handle demote differently Trivial manual tests: * make check : OK * make check-world : OK * start in production -> demote -> demote: OK * start in production -> demote -> stop : OK * start in production -> demote -> promote : NOK (2PC, see TODO) but OK with no prepared xact. Discuss/Todo: * roll back prepared xacts * cancel/kill active/idle in xact R/W backends * pg_demote() function? * some more code reviewing around StartupXlog * investigate snapshots shmem needs/init during recovery compared to production * add tap tests * add doc * how to handle checkpoint?
--- src/backend/access/rmgrdesc/xlogdesc.c | 9 +- src/backend/access/transam/xlog.c | 287 +++++++++++++++--------- src/backend/postmaster/checkpointer.c | 22 ++ src/backend/postmaster/postmaster.c | 250 ++++++++++++++++----- src/backend/storage/ipc/procsignal.c | 4 + src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_ctl/pg_ctl.c | 111 +++++++++ src/include/access/xlog.h | 18 +- src/include/catalog/pg_control.h | 2 + src/include/libpq/libpq-be.h | 7 +- src/include/postmaster/bgwriter.h | 1 + src/include/storage/pmsignal.h | 1 + src/include/storage/procsignal.h | 1 + src/include/utils/pidfile.h | 1 + 14 files changed, 537 insertions(+), 179 deletions(-) diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 1cd97852e8..5aeaff18f8 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -40,7 +40,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; if (info == XLOG_CHECKPOINT_SHUTDOWN || - info == XLOG_CHECKPOINT_ONLINE) + info == XLOG_CHECKPOINT_ONLINE || + info == XLOG_CHECKPOINT_DEMOTE) { CheckPoint *checkpoint = (CheckPoint *) rec; @@ -65,7 +66,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->oldestCommitTsXid, checkpoint->newestCommitTsXid, checkpoint->oldestActiveXid, - (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online"); + (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : + (info == XLOG_CHECKPOINT_DEMOTE)? 
"demote" : "online"); } else if (info == XLOG_NEXTOID) { @@ -185,6 +187,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_CHECKPOINT_DEMOTE: + id = "CHECKPOINT_DEMOTE"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e455384b5b..0e18e546ba 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6301,6 +6301,13 @@ CheckRequiredParameterValues(void) /* * This must be called ONCE during postmaster or standalone-backend startup */ +/* + * FIXME demote: part of the code here assume there's no other active + * processes before signal PMSIGNAL_RECOVERY_STARTED is sent. + * + * FIXME demote: rollback prepared xact during demote? + */ + void StartupXLOG(void) { @@ -6324,6 +6331,7 @@ StartupXLOG(void) XLogPageReadPrivate private; bool fast_promoted = false; struct stat st; + bool is_demoting = false; /* * We should have an aux process resource owner to use, and we should not @@ -6388,6 +6396,16 @@ StartupXLOG(void) str_time(ControlFile->time)))); break; + case DB_DEMOTING: + ereport(LOG, + (errmsg("database system was demoted at %s", + str_time(ControlFile->time)))); + is_demoting = true; + bgwriterLaunched = true; + InArchiveRecovery = true; + StandbyMode = true; + break; + default: ereport(FATAL, (errmsg("control file contains invalid database cluster state"))); @@ -6421,7 +6439,8 @@ StartupXLOG(void) * persisted. To avoid that, fsync the entire data directory. 
*/ if (ControlFile->state != DB_SHUTDOWNED && - ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY && + ControlFile->state != DB_DEMOTING) { RemoveTempXlogFiles(); SyncDataDirectory(); @@ -6678,6 +6697,9 @@ StartupXLOG(void) } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + + if (is_demoting) + Assert((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_DEMOTE); } /* @@ -6739,9 +6761,9 @@ StartupXLOG(void) LastRec = RecPtr = checkPointLoc; ereport(DEBUG1, - (errmsg_internal("redo record is at %X/%X; shutdown %s", + (errmsg_internal("redo record is at %X/%X; %s checkpoint", (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo, - wasShutdown ? "true" : "false"))); + wasShutdown ? "shutdown" : is_demoting? "demote": ""))); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", U64FromFullTransactionId(checkPoint.nextFullXid), @@ -6775,47 +6797,74 @@ StartupXLOG(void) checkPoint.newestCommitTsXid); XLogCtl->ckptFullXid = checkPoint.nextFullXid; - /* - * Initialize replication slots, before there's a chance to remove - * required resources. - */ - StartupReplicationSlots(); + if (!is_demoting) + { + /* + * Initialize replication slots, before there's a chance to remove + * required resources. + */ + StartupReplicationSlots(); - /* - * Startup logical state, needs to be setup now so we have proper data - * during crash recovery. - */ - StartupReorderBuffer(); + /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); - /* - * Startup MultiXact. We need to do this early to be able to replay - * truncations. - */ - StartupMultiXact(); + /* + * Startup MultiXact. We need to do this early to be able to replay + * truncations. + */ + StartupMultiXact(); - /* - * Ditto for commit timestamps. 
Activate the facility if the setting is - * enabled in the control file, as there should be no tracking of commit - * timestamps done when the setting was disabled. This facility can be - * started or stopped when replaying a XLOG_PARAMETER_CHANGE record. - */ - if (ControlFile->track_commit_timestamp) - StartupCommitTs(); + /* + * Ditto for commit timestamps. Activate the facility if the setting is + * enabled in the control file, as there should be no tracking of commit + * timestamps done when the setting was disabled. This facility can be + * started or stopped when replaying a XLOG_PARAMETER_CHANGE record. + */ + if (ControlFile->track_commit_timestamp) + StartupCommitTs(); - /* - * Recover knowledge about replay progress of known replication partners. - */ - StartupReplicationOrigin(); + /* + * Recover knowledge about replay progress of known replication partners. + */ + StartupReplicationOrigin(); - /* - * Initialize unlogged LSN. On a clean shutdown, it's restored from the - * control file. On recovery, all unlogged relations are blown away, so - * the unlogged LSN counter can be reset too. - */ - if (ControlFile->state == DB_SHUTDOWNED) - XLogCtl->unloggedLSN = ControlFile->unloggedLSN; - else - XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; + /* + * Initialize unlogged LSN. On a clean shutdown, it's restored from the + * control file. On recovery, all unlogged relations are blown away, so + * the unlogged LSN counter can be reset too. + */ + if (ControlFile->state == DB_SHUTDOWNED) + XLogCtl->unloggedLSN = ControlFile->unloggedLSN; + else + XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; + + /* + * Copy any missing timeline history files between 'now' and the recovery + * target timeline from archive to pg_wal. While we don't need those files + * ourselves - the history file of the recovery target timeline covers all + * the previous timelines in the history too - a cascading standby server + * might be interested in them. 
Or, if you archive the WAL from this + * server to a different archive than the master, it'd be good for all the + * history files to get archived there after failover, so that you can use + * one of the old timelines as a PITR target. Timeline history files are + * small, so it's better to copy them unnecessarily than not copy them and + * regret later. + */ + restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); + + /* + * Before running in recovery, scan pg_twophase and fill in its status to + * be able to work on entries generated by redo. Doing a scan before + * taking any recovery action has the merit to discard any 2PC files that + * are newer than the first record to replay, saving from any conflicts at + * replay. This avoids as well any subsequent scans when doing recovery + * of the on-disk two-phase data. + */ + restoreTwoPhaseData(); + } /* * We must replay WAL entries using the same TimeLineID they were created @@ -6824,30 +6873,6 @@ StartupXLOG(void) */ ThisTimeLineID = checkPoint.ThisTimeLineID; - /* - * Copy any missing timeline history files between 'now' and the recovery - * target timeline from archive to pg_wal. While we don't need those files - * ourselves - the history file of the recovery target timeline covers all - * the previous timelines in the history too - a cascading standby server - * might be interested in them. Or, if you archive the WAL from this - * server to a different archive than the master, it'd be good for all the - * history files to get archived there after failover, so that you can use - * one of the old timelines as a PITR target. Timeline history files are - * small, so it's better to copy them unnecessarily than not copy them and - * regret later. - */ - restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); - - /* - * Before running in recovery, scan pg_twophase and fill in its status to - * be able to work on entries generated by redo. 
Doing a scan before - * taking any recovery action has the merit to discard any 2PC files that - * are newer than the first record to replay, saving from any conflicts at - * replay. This avoids as well any subsequent scans when doing recovery - * of the on-disk two-phase data. - */ - restoreTwoPhaseData(); - lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -6985,7 +7010,8 @@ StartupXLOG(void) /* * Reset pgstat data, because it may be invalid after recovery. */ - pgstat_reset_all(); + if (!is_demoting) + pgstat_reset_all(); /* * If there was a backup label file, it's done its job and the info @@ -7061,8 +7087,11 @@ StartupXLOG(void) * timestamp have already been started up and other SLRUs are not * maintained during recovery and need not be started yet. */ - StartupCLOG(); - StartupSUBTRANS(oldestActiveXID); + if (!is_demoting) + { + StartupCLOG(); + StartupSUBTRANS(oldestActiveXID); + } /* * If we're beginning at a shutdown checkpoint, we know that @@ -7070,7 +7099,7 @@ StartupXLOG(void) * empty running-xacts record and use that here and now. Recover * additional standby state for prepared transactions. 
*/ - if (wasShutdown) + if (wasShutdown || is_demoting) { RunningTransactionsData running; TransactionId latestCompletedXid; @@ -7093,9 +7122,10 @@ StartupXLOG(void) running.xids = xids; ProcArrayApplyRecoveryInfo(&running); + } + if (wasShutdown) StandbyRecoverPreparedTransactions(); - } } /* Initialize resource managers */ @@ -7941,6 +7971,7 @@ StartupXLOG(void) SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE; + XLogCtl->SharedHotStandbyActive = false; SpinLockRelease(&XLogCtl->info_lck); UpdateControlFile(); @@ -8292,7 +8323,8 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, } info = record->xl_info & ~XLR_INFO_MASK; if (info != XLOG_CHECKPOINT_SHUTDOWN && - info != XLOG_CHECKPOINT_ONLINE) + info != XLOG_CHECKPOINT_ONLINE && + info != XLOG_CHECKPOINT_DEMOTE) { switch (whichChkpt) { @@ -8486,6 +8518,8 @@ GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) void ShutdownXLOG(int code, Datum arg) { + bool isDemoting = DatumGetBool(arg); + /* * We should have an aux process resource owner to use, and we should not * be in a transaction that's installed some other resowner. @@ -8495,36 +8529,56 @@ ShutdownXLOG(int code, Datum arg) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; - /* Don't be chatty in standalone mode */ - ereport(IsPostmasterEnvironment ? LOG : NOTICE, - (errmsg("shutting down"))); - - /* - * Signal walsenders to move to stopping state. - */ - WalSndInitStopping(); - - /* - * Wait for WAL senders to be in stopping state. This prevents commands - * from writing new WAL. - */ - WalSndWaitStopping(); + if (isDemoting) + { + /* Don't be chatty in standalone mode */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("demoting"))); - if (RecoveryInProgress()) - CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + /* + * FIXME demote: avoiding checkpoint? + * A checkpoint is probably running during a demote action. 
If + * we don't want to wait for the checkpoint during the demote, + * we might need to cancel it as it will not be able to write + * to the WAL after the demote. + */ + CreateCheckPoint(CHECKPOINT_IS_DEMOTE | CHECKPOINT_IMMEDIATE); + LocalRecoveryInProgress = true; + } else { + /* Don't be chatty in standalone mode */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("shutting down"))); + /* - * If archiving is enabled, rotate the last XLOG file so that all the - * remaining records are archived (postmaster wakes up the archiver - * process one more time at the end of shutdown). The checkpoint - * record will go to the next XLOG file and won't be archived (yet). + * Signal walsenders to move to stopping state. */ - if (XLogArchivingActive() && XLogArchiveCommandSet()) - RequestXLogSwitch(false); + WalSndInitStopping(); + + /* + * Wait for WAL senders to be in stopping state. This prevents commands + * from writing new WAL. + */ + WalSndWaitStopping(); - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + if (RecoveryInProgress()) + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + else + { + /* + * If archiving is enabled, rotate the last XLOG file so that all the + * remaining records are archived (postmaster wakes up the archiver + * process one more time at the end of shutdown). The checkpoint + * record will go to the next XLOG file and won't be archived (yet). + */ + if (XLogArchivingActive() && XLogArchiveCommandSet()) + RequestXLogSwitch(false); + + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } } + ShutdownCLOG(); ShutdownCommitTs(); ShutdownSUBTRANS(); @@ -8537,9 +8591,10 @@ ShutdownXLOG(int code, Datum arg) static void LogCheckpointStart(int flags, bool restartpoint) { - elog(LOG, "%s starting:%s%s%s%s%s%s%s%s", + elog(LOG, "%s starting:%s%s%s%s%s%s%s%s%s", restartpoint ? "restartpoint" : "checkpoint", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_IS_DEMOTE) ? 
" demote" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", @@ -8675,6 +8730,7 @@ UpdateCheckPointDistanceEstimate(uint64 nbytes) * * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_IS_DEMOTE: checkpoint is for demote. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. @@ -8703,6 +8759,7 @@ void CreateCheckPoint(int flags) { bool shutdown; + bool demote; CheckPoint checkPoint; XLogRecPtr recptr; XLogSegNo _logSegNo; @@ -8723,6 +8780,14 @@ CreateCheckPoint(int flags) else shutdown = false; + /* + * An demote checkpoint is kind of a shutdown checkpoint as well + */ + if (flags & CHECKPOINT_IS_DEMOTE) + demote = true; + else + demote = false; + /* sanity check */ if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0) elog(ERROR, "can't create a checkpoint during recovery"); @@ -8760,10 +8825,10 @@ CreateCheckPoint(int flags) */ START_CRIT_SECTION(); - if (shutdown) + if (shutdown || demote) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->state = DB_SHUTDOWNING; + ControlFile->state = demote? DB_DEMOTING:DB_SHUTDOWNING; ControlFile->time = (pg_time_t) time(NULL); UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -8809,7 +8874,7 @@ CreateCheckPoint(int flags) * avoid inserting duplicate checkpoints when the system is idle. */ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_FORCE)) == 0) + CHECKPOINT_IS_DEMOTE | CHECKPOINT_FORCE)) == 0) { if (last_important_lsn == ControlFile->checkPoint) { @@ -8980,7 +9045,7 @@ CreateCheckPoint(int flags) * If we are shutting down, or Startup process is completing crash * recovery we don't need to write running xact data. 
*/ - if (!shutdown && XLogStandbyInfoActive()) + if (!(shutdown||demote) && XLogStandbyInfoActive()) LogStandbySnapshot(); START_CRIT_SECTION(); @@ -8990,20 +9055,23 @@ CreateCheckPoint(int flags) */ XLogBeginInsert(); XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); - recptr = XLogInsert(RM_XLOG_ID, - shutdown ? XLOG_CHECKPOINT_SHUTDOWN : - XLOG_CHECKPOINT_ONLINE); + if (demote) + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_DEMOTE); + else if (shutdown) + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_SHUTDOWN); + else + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_ONLINE); XLogFlush(recptr); /* - * We mustn't write any new WAL after a shutdown checkpoint, or it will be - * overwritten at next startup. No-one should even try, this just allows - * sanity-checking. In the case of an end-of-recovery checkpoint, we want - * to just temporarily disable writing until the system has exited - * recovery. + * We mustn't write any new WAL after a shutdown or demote checkpoint, or + * it will be overwritten at next startup. No-one should even try, this + * just allows sanity-checking. In the case of an end-of-recovery + * checkpoint, we want to just temporarily disable writing until the system + * has exited recovery. */ - if (shutdown) + if (shutdown||demote) { if (flags & CHECKPOINT_END_OF_RECOVERY) LocalXLogInsertAllowed = -1; /* return to "check" state */ @@ -9015,9 +9083,10 @@ CreateCheckPoint(int flags) * We now have ProcLastRecPtr = start of actual checkpoint record, recptr * = end of actual checkpoint record. */ - if (shutdown && checkPoint.redo != ProcLastRecPtr) + if ((shutdown||demote) && checkPoint.redo != ProcLastRecPtr) ereport(PANIC, - (errmsg("concurrent write-ahead log activity while database system is shutting down"))); + (errmsg("concurrent write-ahead log activity while database system is %s", + shutdown? 
"shutting down":"demoting"))); /* * Remember the prior checkpoint's redo ptr for @@ -9087,7 +9156,7 @@ CreateCheckPoint(int flags) * Make more log segments if needed. (Do this after recycling old log * segments, since that may supply some of the needed files.) */ - if (!shutdown) + if (!(shutdown||demote)) PreallocXlogFiles(recptr); /* diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 624a3238b8..cf8ea2a601 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -52,6 +52,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" +#include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" @@ -151,6 +152,7 @@ double CheckPointCompletionTarget = 0.5; * Private state */ static bool ckpt_active = false; +static volatile sig_atomic_t demoteRequestPending = false; /* these values are valid when ckpt_active is true: */ static pg_time_t ckpt_start_time; @@ -552,6 +554,14 @@ HandleCheckpointerInterrupts(void) */ UpdateSharedMemoryConfig(); } + if (demoteRequestPending) + { + demoteRequestPending = false; + /* Close down the database */ + ShutdownXLOG(0, BoolGetDatum(true)); + SendPostmasterSignal(PMSIGNAL_DEMOTING); + /* no need to exit the checkpointer during demote */ + } if (ShutdownRequestPending) { /* @@ -680,6 +690,7 @@ CheckpointWriteDelay(int flags, double progress) * in which case we just try to catch up as quickly as possible. 
*/ if (!(flags & CHECKPOINT_IMMEDIATE) && + !demoteRequestPending && !ShutdownRequestPending && !ImmediateCheckpointRequested() && IsCheckpointOnSchedule(progress)) @@ -812,6 +823,17 @@ IsCheckpointOnSchedule(double progress) * -------------------------------- */ +/* SIGUSR1: set flag to demote */ +void +ReqCheckpointDemoteHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + demoteRequestPending = true; + + errno = save_errno; +} + /* SIGINT: set flag to run a normal checkpoint right away */ static void ReqCheckpointHandler(SIGNAL_ARGS) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b4d475bb0b..d5cc63f697 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -150,6 +150,9 @@ #define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER) +/* file to signal demotion from primary to standby */ +#define DEMOTE_SIGNAL_FILE "demote" + /* * List of active backends (or child processes anyway; we don't actually * know whether a given child has become a backend or is still in the @@ -269,18 +272,23 @@ typedef enum static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; /* Startup/shutdown state */ -#define NoShutdown 0 -#define SmartShutdown 1 -#define FastShutdown 2 -#define ImmediateShutdown 3 - -static int Shutdown = NoShutdown; +typedef enum StepDownState { + NoShutdown = 0, /* find better label? */ + SmartShutdown, + SmartDemote, + FastShutdown, + FastDemote, + ImmediateShutdown +} StepDownState; + +static StepDownState StepDown = NoShutdown; +static bool DemoteSignal = false; /* true on demote request */ static bool FatalError = false; /* T if recovering from backend crash */ /* - * We use a simple state machine to control startup, shutdown, and - * crash recovery (which is rather like shutdown followed by startup). + * We use a simple state machine to control startup, shutdown, demote and + * crash recovery (both are rather like shutdown followed by startup). 
* * After doing all the postmaster initialization work, we enter PM_STARTUP * state and the startup process is launched. The startup process begins by @@ -314,7 +322,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * will not be very long). * * Notice that this state variable does not distinguish *why* we entered - * states later than PM_RUN --- Shutdown and FatalError must be consulted + * states later than PM_RUN --- StepDown and FatalError must be consulted * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN * states, nor in PM_SHUTDOWN states (because we don't enter those states * when trying to recover from a crash). It can be true in PM_STARTUP state, @@ -414,6 +422,8 @@ static bool RandomCancelKey(int32 *cancel_key); static void signal_child(pid_t pid, int signal); static bool SignalSomeChildren(int signal, int targets); static void TerminateChildren(int signal); +static bool CheckDemoteSignal(void); + #define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL) @@ -1550,7 +1560,7 @@ DetermineSleepTime(struct timeval *timeout) * Normal case: either there are no background workers at all, or we're in * a shutdown sequence (during which we ignore bgworkers altogether). */ - if (Shutdown > NoShutdown || + if (StepDown > NoShutdown || (!StartWorkerNeeded && !HaveCrashedWorker)) { if (AbortStartTime != 0) @@ -1830,7 +1840,7 @@ ServerLoop(void) * * Note we also do this during recovery from a process crash. 
*/ - if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) && + if ((StepDown >= ImmediateShutdown || (FatalError && !SendStop)) && AbortStartTime != 0 && (now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS) { @@ -2305,6 +2315,11 @@ retry1: (errcode(ERRCODE_CANNOT_CONNECT_NOW), errmsg("the database system is starting up"))); break; + case CAC_DEMOTE: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is demoting"))); + break; case CAC_SHUTDOWN: ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), @@ -2436,7 +2451,7 @@ canAcceptConnections(int backend_type) CAC_state result = CAC_OK; /* - * Can't start backends when in startup/shutdown/inconsistent recovery + * Can't start backends when in startup/demote/shutdown/inconsistent recovery * state. We treat autovac workers the same as user backends for this * purpose. However, bgworkers are excluded from this test; we expect * bgworker_should_start_now() decided whether the DB state allows them. 
@@ -2452,7 +2467,9 @@ canAcceptConnections(int backend_type) { if (pmState == PM_WAIT_BACKUP) result = CAC_WAITBACKUP; /* allow superusers only */ - else if (Shutdown > NoShutdown) + else if (StepDown == SmartDemote || StepDown == FastDemote) + return CAC_DEMOTE; /* demote is pending */ + else if (StepDown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ else if (!FatalError && (pmState == PM_STARTUP || @@ -2683,7 +2700,8 @@ SIGHUP_handler(SIGNAL_ARGS) PG_SETMASK(&BlockSig); #endif - if (Shutdown <= SmartShutdown) + if (StepDown == NoShutdown || StepDown == SmartShutdown || + StepDown == SmartDemote) { ereport(LOG, (errmsg("received SIGHUP, reloading configuration files"))); @@ -2769,26 +2787,81 @@ pmdie(SIGNAL_ARGS) (errmsg_internal("postmaster received signal %d", postgres_signal_arg))); + if (CheckDemoteSignal()) + { + if (pmState != PM_RUN) + { + DemoteSignal = false; + unlink(DEMOTE_SIGNAL_FILE); + ereport(LOG, + (errmsg("ignoring demote signal because already in standby mode"))); + goto out; + } + else if (postgres_signal_arg == SIGQUIT) + { + DemoteSignal = false; + unlink(DEMOTE_SIGNAL_FILE); + ereport(WARNING, + (errmsg("can not demote in immediate stop mode"))); + goto out; + } + else + { + FILE *standby_file; + + DemoteSignal = true; + + unlink(DEMOTE_SIGNAL_FILE); + + /* create the standby signal file */ + standby_file = AllocateFile(STANDBY_SIGNAL_FILE, "w"); + if (!standby_file) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + STANDBY_SIGNAL_FILE))); + goto out; + } + + if (FreeFile(standby_file)) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + STANDBY_SIGNAL_FILE))); + goto out; + } + } + } + switch (postgres_signal_arg) { case SIGTERM: /* - * Smart Shutdown: + * Smart Stepdown: * - * Wait for children to end their work, then shut down. + * Wait for children to end their work, then shut down or demote. 
*/ - if (Shutdown >= SmartShutdown) + if (StepDown >= SmartShutdown) break; - Shutdown = SmartShutdown; - ereport(LOG, - (errmsg("received smart shutdown request"))); - /* Report status */ - AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); + if (DemoteSignal) { + StepDown = SmartDemote; + ereport(LOG, (errmsg("received smart demote request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_DEMOTING); + } + else { + StepDown = SmartShutdown; + ereport(LOG, (errmsg("received smart shutdown request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD - sd_notify(0, "STOPPING=1"); + sd_notify(0, "STOPPING=1"); #endif + } if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) @@ -2831,22 +2904,29 @@ pmdie(SIGNAL_ARGS) case SIGINT: /* - * Fast Shutdown: + * Fast StepDown: * * Abort all children with SIGTERM (rollback active transactions - * and exit) and shut down when they are gone. + * and exit) and shut down or demote when they are gone. 
*/ - if (Shutdown >= FastShutdown) + if (StepDown >= FastShutdown) break; - Shutdown = FastShutdown; - ereport(LOG, - (errmsg("received fast shutdown request"))); - /* Report status */ - AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); + if (DemoteSignal) { + StepDown = FastDemote; + ereport(LOG, (errmsg("received fast demote request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_DEMOTING); + } + else { + StepDown = FastShutdown; + ereport(LOG, (errmsg("received fast shutdown request"))); + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); #ifdef USE_SYSTEMD - sd_notify(0, "STOPPING=1"); + sd_notify(0, "STOPPING=1"); #endif + } if (StartupPID != 0) signal_child(StartupPID, SIGTERM); @@ -2903,9 +2983,9 @@ pmdie(SIGNAL_ARGS) * terminate remaining ones with SIGKILL, then exit without * attempt to properly shut down the data base system. */ - if (Shutdown >= ImmediateShutdown) + if (StepDown >= ImmediateShutdown) break; - Shutdown = ImmediateShutdown; + StepDown = ImmediateShutdown; ereport(LOG, (errmsg("received immediate shutdown request"))); @@ -2929,6 +3009,7 @@ pmdie(SIGNAL_ARGS) break; } +out: #ifdef WIN32 PG_SETMASK(&UnBlockSig); #endif @@ -2967,10 +3048,11 @@ reaper(SIGNAL_ARGS) StartupPID = 0; /* - * Startup process exited in response to a shutdown request (or it - * completed normally regardless of the shutdown request). + * Startup process exited in response to a shutdown or demote + * request (or it completed normally regardless of the shutdown + * request). 
*/ - if (Shutdown > NoShutdown && + if (StepDown > NoShutdown && (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) { StartupStatus = STARTUP_NOT_RUNNING; @@ -2984,7 +3066,7 @@ reaper(SIGNAL_ARGS) ereport(LOG, (errmsg("shutdown at recovery target"))); StartupStatus = STARTUP_NOT_RUNNING; - Shutdown = SmartShutdown; + StepDown = SmartShutdown; TerminateChildren(SIGTERM); pmState = PM_WAIT_BACKENDS; /* PostmasterStateMachine logic does the rest */ @@ -3124,7 +3206,7 @@ reaper(SIGNAL_ARGS) * archive cycle and quit. Likewise, if we have walsender * processes, tell them to send any remaining WAL and quit. */ - Assert(Shutdown > NoShutdown); + Assert(StepDown > NoShutdown); /* Waken archiver for the last time */ if (PgArchPID != 0) @@ -3484,7 +3566,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * signaled children, nonzero exit status is to be expected, so don't * clutter log. */ - take_action = !FatalError && Shutdown != ImmediateShutdown; + take_action = !FatalError && StepDown != ImmediateShutdown; if (take_action) { @@ -3702,7 +3784,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) /* We do NOT restart the syslogger */ - if (Shutdown != ImmediateShutdown) + if (StepDown != ImmediateShutdown) FatalError = true; /* We now transit into a state of waiting for children to die */ @@ -3845,11 +3927,11 @@ PostmasterStateMachine(void) WalReceiverPID == 0 && BgWriterPID == 0 && (CheckpointerPID == 0 || - (!FatalError && Shutdown < ImmediateShutdown)) && + (!FatalError && StepDown < ImmediateShutdown)) && WalWriterPID == 0 && AutoVacPID == 0) { - if (Shutdown >= ImmediateShutdown || FatalError) + if (StepDown >= ImmediateShutdown || FatalError) { /* * Start waiting for dead_end children to die. This state @@ -3863,6 +3945,15 @@ PostmasterStateMachine(void) * FatalError state. 
*/ } + /* Handle demote signal */ + else if (DemoteSignal) + { + ereport(LOG, (errmsg("all backend processes terminated; demoting"))); + + SendProcSignal(CheckpointerPID, PROCSIG_CHECKPOINTER_DEMOTING, InvalidBackendId); + pmState = PM_STARTUP; + StepDown = NoShutdown; + } else { /* @@ -3870,7 +3961,7 @@ PostmasterStateMachine(void) * the regular children are gone, and it's time to tell the * checkpointer to do a shutdown checkpoint. */ - Assert(Shutdown > NoShutdown); + Assert(StepDown > NoShutdown); /* Start the checkpointer if not running */ if (CheckpointerPID == 0) CheckpointerPID = StartCheckpointer(); @@ -3958,7 +4049,8 @@ PostmasterStateMachine(void) * EOF on its input pipe, which happens when there are no more upstream * processes. */ - if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) + if (pmState == PM_NO_CHILDREN && (StepDown == SmartShutdown || + StepDown == FastShutdown || StepDown == ImmediateShutdown)) { if (FatalError) { @@ -3991,7 +4083,7 @@ PostmasterStateMachine(void) * startup process fails, because more than likely it will just fail again * and we will keep trying forever. */ - if (pmState == PM_NO_CHILDREN && + if (pmState == PM_NO_CHILDREN && !DemoteSignal && (StartupStatus == STARTUP_CRASHED || !restart_after_crash)) ExitPostmaster(1); @@ -5188,6 +5280,17 @@ sigusr1_handler(SIGNAL_ARGS) StartWorkerNeeded = true; } + /* Demoting: start the Startup Process */ + if (CheckPostmasterSignal(PMSIGNAL_DEMOTING) && + pmState == PM_STARTUP && StepDown == NoShutdown) + { + if (!XLogArchivingAlways()) + signal_child(PgArchPID, SIGQUIT); + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + } + /* * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in * unexpected states. If the startup process quickly starts up, completes @@ -5195,7 +5298,7 @@ sigusr1_handler(SIGNAL_ARGS) * first. We don't want to go back to recovery in that case. 
*/ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) && - pmState == PM_STARTUP && Shutdown == NoShutdown) + pmState == PM_STARTUP && StepDown == NoShutdown) { /* WAL redo has started. We're out of reinitialization. */ FatalError = false; @@ -5205,17 +5308,27 @@ sigusr1_handler(SIGNAL_ARGS) * Crank up the background tasks. It doesn't matter if this fails, * we'll just try again later. */ - Assert(CheckpointerPID == 0); - CheckpointerPID = StartCheckpointer(); - Assert(BgWriterPID == 0); - BgWriterPID = StartBackgroundWriter(); + if (!DemoteSignal) + { + Assert(CheckpointerPID == 0); + Assert(BgWriterPID == 0); + Assert(PgArchPID == 0); + + CheckpointerPID = StartCheckpointer(); + } + else + { + Assert(CheckpointerPID); + } + + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); /* * Start the archiver if we're responsible for (re-)archiving received * files. */ - Assert(PgArchPID == 0); - if (XLogArchivingAlways()) + if (PgArchPID == 0 && XLogArchivingAlways()) PgArchPID = pgarch_start(); /* @@ -5226,6 +5339,7 @@ sigusr1_handler(SIGNAL_ARGS) if (!EnableHotStandby) { AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STANDBY); + DemoteSignal = false; #ifdef USE_SYSTEMD sd_notify(0, "READY=1"); #endif @@ -5234,13 +5348,15 @@ sigusr1_handler(SIGNAL_ARGS) pmState = PM_RECOVERY; } if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && - pmState == PM_RECOVERY && Shutdown == NoShutdown) + pmState == PM_RECOVERY && StepDown == NoShutdown) { /* * Likewise, start other special children as needed. 
*/ - Assert(PgStatPID == 0); - PgStatPID = pgstat_start(); + if (!DemoteSignal) + Assert(PgStatPID == 0); + if(PgStatPID == 0) + PgStatPID = pgstat_start(); ereport(LOG, (errmsg("database system is ready to accept read only connections"))); @@ -5252,6 +5368,7 @@ sigusr1_handler(SIGNAL_ARGS) #endif pmState = PM_HOT_STANDBY; + DemoteSignal = false; /* Some workers may be scheduled to start now */ StartWorkerNeeded = true; } @@ -5284,7 +5401,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { /* * Start one iteration of the autovacuum daemon, even if autovacuuming @@ -5299,7 +5416,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { /* The autovacuum launcher wants us to start a worker process. */ StartAutovacuumWorker(); @@ -5644,7 +5761,7 @@ MaybeStartWalReceiver(void) if (WalReceiverPID == 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && - Shutdown == NoShutdown) + StepDown == NoShutdown) { WalReceiverPID = StartWalReceiver(); if (WalReceiverPID != 0) @@ -6647,3 +6764,18 @@ InitPostmasterDeathWatchHandle(void) GetLastError()))); #endif /* WIN32 */ } + +/* + * Check if a demote request appeared. Should be called by postmaster before + * shutting down.
+ */ +bool +CheckDemoteSignal(void) +{ + struct stat stat_buf; + + if (stat(DEMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 4fa385b0ec..1903f4db2a 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -28,6 +28,7 @@ #include "storage/shmem.h" #include "storage/sinval.h" #include "tcop/tcopprot.h" +#include "postmaster/bgwriter.h" /* * The SIGUSR1 signal is multiplexed to support signaling multiple event @@ -585,6 +586,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + if (CheckProcSignal(PROCSIG_CHECKPOINTER_DEMOTING)) + ReqCheckpointDemoteHandler(PROCSIG_CHECKPOINTER_DEMOTING); + SetLatch(MyLatch); latch_sigusr1_handler(); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index e73639df74..c144cc35d3 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -57,6 +57,8 @@ dbState(DBState state) return _("shut down"); case DB_SHUTDOWNED_IN_RECOVERY: return _("shut down in recovery"); + case DB_DEMOTING: + return _("demoting"); case DB_SHUTDOWNING: return _("shutting down"); case DB_IN_CRASH_RECOVERY: diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 3c03ace7ed..79bb42f7e7 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -62,6 +62,7 @@ typedef enum RESTART_COMMAND, RELOAD_COMMAND, STATUS_COMMAND, + DEMOTE_COMMAND, PROMOTE_COMMAND, LOGROTATE_COMMAND, KILL_COMMAND, @@ -103,6 +104,7 @@ static char version_file[MAXPGPATH]; static char pid_file[MAXPGPATH]; static char backup_file[MAXPGPATH]; static char promote_file[MAXPGPATH]; +static char demote_file[MAXPGPATH]; static char logrotate_file[MAXPGPATH]; static volatile pgpid_t postmasterPID = -1; @@ -129,6 +131,7 @@ 
static void do_stop(void); static void do_restart(void); static void do_reload(void); static void do_status(void); +static void do_demote(void); static void do_promote(void); static void do_logrotate(void); static void do_kill(pgpid_t pid); @@ -1029,6 +1032,109 @@ do_stop(void) } +static void +do_demote(void) +{ + int cnt; + FILE *dmtfile; + pgpid_t pid; + struct stat statbuf; + + pid = get_pgpid(false); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not postmaster */ + { + pid = -pid; + write_stderr(_("%s: cannot demote server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + exit(1); + } + if (shutdown_mode == IMMEDIATE_MODE) + { + write_stderr(_("%s: cannot demote server using immediate mode"), + progname); + exit(1); + } + + snprintf(demote_file, MAXPGPATH, "%s/demote", pg_data); + + if ((dmtfile = fopen(demote_file, "w")) == NULL) + { + write_stderr(_("%s: could not create demote signal file \"%s\": %s\n"), + progname, demote_file, strerror(errno)); + exit(1); + } + if (fclose(dmtfile)) + { + write_stderr(_("%s: could not write demote signal file \"%s\": %s\n"), + progname, demote_file, strerror(errno)); + exit(1); + } + + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + if (!do_wait) + { + print_msg(_("server demoting\n")); + return; + } + else + { + /* + * If backup_label exists, an online backup is running. Warn the user + * that smart demote will wait for it to finish. However, if the + * server is in archive recovery, we're recovering from an online + * backup instead of performing one. 
+ */ + if (shutdown_mode == SMART_MODE && + stat(backup_file, &statbuf) == 0 && + get_control_dbstate() != DB_IN_ARCHIVE_RECOVERY) + { + print_msg(_("WARNING: online backup mode is active\n" + "Demote will not complete until pg_stop_backup() is called.\n\n")); + } + + print_msg(_("waiting for server to demote...")); + + for (cnt = 0; cnt < wait_seconds * WAITS_PER_SEC; cnt++) + { + if (get_control_dbstate() == DB_IN_ARCHIVE_RECOVERY) + break; + + if (cnt % WAITS_PER_SEC == 0) + print_msg("."); + pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); + } + + if (get_control_dbstate() != DB_IN_ARCHIVE_RECOVERY) + { + print_msg(_(" failed\n")); + + write_stderr(_("%s: server does not demote\n"), progname); + if (shutdown_mode == SMART_MODE) + write_stderr(_("HINT: The \"-m fast\" option immediately disconnects sessions rather than\n" + "waiting for session-initiated disconnection.\n")); + exit(1); + } + print_msg(_(" done\n")); + + print_msg(_("server demoted\n")); + } +} + + /* * restart/reload routines */ @@ -2452,6 +2558,8 @@ main(int argc, char **argv) ctl_command = RELOAD_COMMAND; else if (strcmp(argv[optind], "status") == 0) ctl_command = STATUS_COMMAND; + else if (strcmp(argv[optind], "demote") == 0) + ctl_command = DEMOTE_COMMAND; else if (strcmp(argv[optind], "promote") == 0) ctl_command = PROMOTE_COMMAND; else if (strcmp(argv[optind], "logrotate") == 0) @@ -2559,6 +2667,9 @@ main(int argc, char **argv) case RELOAD_COMMAND: do_reload(); break; + case DEMOTE_COMMAND: + do_demote(); + break; case PROMOTE_COMMAND: do_promote(); break; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 77ac4e785f..ff0119046e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -219,18 +219,20 @@ extern bool XLOG_DEBUG; /* These directly affect the behavior of CreateCheckPoint and subsidiaries */ #define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */ -#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but +#define 
CHECKPOINT_IS_DEMOTE 0x0002 /* Like shutdown checkpoint, but + * issued at end of WAL production */ +#define CHECKPOINT_END_OF_RECOVERY 0x0004 /* Like shutdown checkpoint, but * issued at end of WAL recovery */ -#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */ -#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */ -#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those +#define CHECKPOINT_IMMEDIATE 0x0008 /* Do it without delays */ +#define CHECKPOINT_FORCE 0x0010 /* Force even if no activity */ +#define CHECKPOINT_FLUSH_ALL 0x0020 /* Flush all pages, including those * belonging to unlogged tables */ /* These are important to RequestCheckpoint */ -#define CHECKPOINT_WAIT 0x0020 /* Wait for completion */ -#define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */ +#define CHECKPOINT_WAIT 0x0040 /* Wait for completion */ +#define CHECKPOINT_REQUESTED 0x0080 /* Checkpoint request has been made */ /* These indicate the cause of a checkpoint request */ -#define CHECKPOINT_CAUSE_XLOG 0x0080 /* XLOG consumption */ -#define CHECKPOINT_CAUSE_TIME 0x0100 /* Elapsed time */ +#define CHECKPOINT_CAUSE_XLOG 0x0100 /* XLOG consumption */ +#define CHECKPOINT_CAUSE_TIME 0x0200 /* Elapsed time */ /* * Flag bits for the record being inserted, set using XLogSetRecordFlags(). 
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index de5670e538..b38671ae52 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -76,6 +76,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_CHECKPOINT_DEMOTE 0xC0 /* @@ -87,6 +88,7 @@ typedef enum DBState DB_STARTUP = 0, DB_SHUTDOWNED, DB_SHUTDOWNED_IN_RECOVERY, + DB_DEMOTING, DB_SHUTDOWNING, DB_IN_CRASH_RECOVERY, DB_IN_ARCHIVE_RECOVERY, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 179ebaa104..a9e27f009e 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -70,7 +70,12 @@ typedef struct typedef enum CAC_state { - CAC_OK, CAC_STARTUP, CAC_SHUTDOWN, CAC_RECOVERY, CAC_TOOMANY, + CAC_OK, + CAC_STARTUP, + CAC_DEMOTE, + CAC_SHUTDOWN, + CAC_RECOVERY, + CAC_TOOMANY, CAC_WAITBACKUP } CAC_state; diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 0a5708b32e..4d4f0ea1dd 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -41,5 +41,6 @@ extern Size CheckpointerShmemSize(void); extern void CheckpointerShmemInit(void); extern bool FirstCallSinceLastCheckpoint(void); +extern void ReqCheckpointDemoteHandler(SIGNAL_ARGS); #endif /* _BGWRITER_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 56c5ec4481..1c5baf3b68 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -41,6 +41,7 @@ typedef enum PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ + PMSIGNAL_DEMOTING, /* restart startup process */ NUM_PMSIGNALS /* Must be last value of enum! 
*/ } PMSignalReason; diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 5cb39697f3..eb0bda04f5 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -34,6 +34,7 @@ typedef enum PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */ PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ PROCSIG_BARRIER, /* global barrier interrupt */ + PROCSIG_CHECKPOINTER_DEMOTING, /* ask checkpointer to demote */ /* Recovery conflict reasons */ PROCSIG_RECOVERY_CONFLICT_DATABASE, diff --git a/src/include/utils/pidfile.h b/src/include/utils/pidfile.h index 63fefe5c4c..f761d2c4ef 100644 --- a/src/include/utils/pidfile.h +++ b/src/include/utils/pidfile.h @@ -50,6 +50,7 @@ */ #define PM_STATUS_STARTING "starting" /* still starting up */ #define PM_STATUS_STOPPING "stopping" /* in shutdown sequence */ +#define PM_STATUS_DEMOTING "demoting" /* demote sequence */ #define PM_STATUS_READY "ready " /* ready for connections */ #define PM_STATUS_STANDBY "standby " /* up, won't accept connections */ -- 2.20.1