On 2020/03/04 23:40, Jehan-Guillaume de Rorthais wrote:
On Wed, 04 Mar 2020 15:00:54 +0300
Sergei Kornilov <s...@zsrv.org> wrote:
Hello
I want to start this discussion because this is related to the patch
(propoesd at the thread [1]) that I'm reviewing. It does that partially,
i.e., prefers the promotion only when the pause is requested by
recovery_target_action=pause. But I think that it's reasonable and
more consistent to do that whether whichever the pause is requested
by pg_wal_replay_pause() or recovery_target_action.
+1.
+1
And pg_wal_replay_pause () should probably raise an error explaining the
standby ignores the pause because of ongoing promotion.
OK, so patch attached.
This patch causes, if a promotion is triggered while recovery is paused,
the paused state to end and a promotion to continue. OTOH, this patch
makes pg_wal_replay_pause() and _resume() throw an error if it's executed
while a promotion is ongoing.
Regarding recovery_target_action, if the recovery target is reached
while a promotion is ongoing, "pause" setting will act the same as "promote",
i.e., recovery will finish and the server will start to accept connections.
To implement the above, I added new shared varible indicating whether
a promotion is triggered or not. Only startup process can update this shared
variable. Other processes like read-only backends can check whether
promotion is ongoing, via this variable.
I added new function PromoteIsTriggered() that returns true if a promotion
is triggered. Since the name of this function and the existing function
IsPromoteTriggered() are confusingly similar, I changed the name of
IsPromoteTriggered() to IsPromoteSignaled, as more appropriate name.
I'd like to apply the change of log message that Sergei proposed at [1]
after commiting this patch if it's ok.
[1]
https://www.postgresql.org/message-id/flat/19168211580382...@myt5-b646bde4b8f3.qloud-c.yandex.net
Regards,
--
Fujii Masao
NTT DATA CORPORATION
Advanced Platform Technology Group
Research and Development Headquarters
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index c1128f89ec..86726376f1 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3570,6 +3570,9 @@ restore_command = 'copy "C:\\server\\archivedir\\%f"
"%p"' # Windows
This setting has no effect if no recovery target is set.
If <xref linkend="guc-hot-standby"/> is not enabled, a setting of
<literal>pause</literal> will act the same as
<literal>shutdown</literal>.
+ If the recovery target is reached while a promotion is ongoing,
+ a setting of <literal>pause</literal> will act the same as
+ <literal>promote</literal>.
</para>
<para>
In any case, if a recovery target is configured but the archive
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 323366feb6..bcd456f52d 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -20154,6 +20154,13 @@ postgres=# SELECT * FROM
pg_walfile_name_offset(pg_stop_backup());
recovery is resumed.
</para>
+ <para>
+ <function>pg_wal_replay_pause</function> and
+ <function>pg_wal_replay_resume</function> cannot be executed while
+ a promotion is ongoing. If a promotion is triggered while recovery
+ is paused, the paused state ends and a promotion continues.
+ </para>
+
<para>
If streaming replication is disabled, the paused state may continue
indefinitely without problem. While streaming replication is in
diff --git a/src/backend/access/transam/xlog.c
b/src/backend/access/transam/xlog.c
index 4361568882..c545aeffa3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -229,6 +229,12 @@ static bool LocalRecoveryInProgress = true;
*/
static bool LocalHotStandbyActive = false;
+/*
+ * Local copy of SharedPromoteIsTriggered variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalPromoteIsTriggered = false;
+
/*
* Local state for XLogInsertAllowed():
* 1: unconditionally allowed to insert XLOG
@@ -654,6 +660,12 @@ typedef struct XLogCtlData
*/
bool SharedHotStandbyActive;
+ /*
+ * SharedPromoteIsTriggered indicates if a standby promotion has been
+ * triggered. Protected by info_lck.
+ */
+ bool SharedPromoteIsTriggered;
+
/*
* WalWriterSleeping indicates whether the WAL writer is currently in
* low-power mode (and hence should be nudged if an async commit
occurs).
@@ -912,6 +924,7 @@ static void InitControlFile(uint64 sysidentifier);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
+static void SetPromoteIsTriggered(void);
static bool CheckForStandbyTrigger(void);
#ifdef WAL_DEBUG
@@ -5112,6 +5125,7 @@ XLOGShmemInit(void)
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->SharedRecoveryInProgress = true;
XLogCtl->SharedHotStandbyActive = false;
+ XLogCtl->SharedPromoteIsTriggered = false;
XLogCtl->WalWriterSleeping = false;
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
@@ -5940,14 +5954,20 @@ recoveryPausesHere(void)
if (!LocalHotStandbyActive)
return;
+ /* Don't pause after standby promotion has been triggered */
+ if (LocalPromoteIsTriggered)
+ return;
+
ereport(LOG,
(errmsg("recovery has paused"),
errhint("Execute pg_wal_replay_resume() to
continue.")));
while (RecoveryIsPaused())
{
+ HandleStartupProcInterrupts();
+ if (CheckForStandbyTrigger())
+ return;
pg_usleep(1000000L); /* 1000 ms */
- HandleStartupProcInterrupts();
}
}
@@ -12252,6 +12272,40 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
return emode;
}
+/*
+ * Has a standby promotion already been triggered?
+ *
+ * Unlike CheckForStandbyTrigger(), this works in any process
+ * that's connected to shared memory.
+ */
+bool
+PromoteIsTriggered(void)
+{
+ /*
+ * We check shared state each time only until a standby promotion is
+ * triggered. We can't trigger a promotion again, so there's no need to
+ * keep checking after the shared variable has once been seen true.
+ */
+ if (LocalPromoteIsTriggered)
+ return true;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return LocalPromoteIsTriggered;
+}
+
+static void
+SetPromoteIsTriggered(void)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedPromoteIsTriggered = true;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ LocalPromoteIsTriggered = true;
+}
+
/*
* Check to see whether the user-specified trigger file exists and whether a
* promote request has arrived. If either condition holds, return true.
@@ -12260,12 +12314,11 @@ static bool
CheckForStandbyTrigger(void)
{
struct stat stat_buf;
- static bool triggered = false;
- if (triggered)
+ if (LocalPromoteIsTriggered)
return true;
- if (IsPromoteTriggered())
+ if (IsPromoteSignaled())
{
/*
* In 9.1 and 9.2 the postmaster unlinked the promote file
inside the
@@ -12288,8 +12341,8 @@ CheckForStandbyTrigger(void)
ereport(LOG, (errmsg("received promote request")));
- ResetPromoteTriggered();
- triggered = true;
+ ResetPromoteSignaled();
+ SetPromoteIsTriggered();
return true;
}
@@ -12301,7 +12354,7 @@ CheckForStandbyTrigger(void)
ereport(LOG,
(errmsg("promote trigger file found: %s",
PromoteTriggerFile)));
unlink(PromoteTriggerFile);
- triggered = true;
+ SetPromoteIsTriggered();
fast_promote = true;
return true;
}
diff --git a/src/backend/access/transam/xlogfuncs.c
b/src/backend/access/transam/xlogfuncs.c
index 20316539b6..b84ba57259 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -531,6 +531,13 @@ pg_wal_replay_pause(PG_FUNCTION_ARGS)
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only
be executed during recovery.")));
+ if (PromoteIsTriggered())
+ ereport(ERROR,
+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("standby promotion is ongoing"),
+ errhint("%s cannot be executed after promotion
is triggered.",
+ "pg_wal_replay_pause()")));
+
SetRecoveryPause(true);
PG_RETURN_VOID();
@@ -551,6 +558,13 @@ pg_wal_replay_resume(PG_FUNCTION_ARGS)
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only
be executed during recovery.")));
+ if (PromoteIsTriggered())
+ ereport(ERROR,
+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("standby promotion is ongoing"),
+ errhint("%s cannot be executed after promotion
is triggered.",
+ "pg_wal_replay_resume()")));
+
SetRecoveryPause(false);
PG_RETURN_VOID();
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index c2250d7d4e..8952676765 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -39,7 +39,7 @@
*/
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;
-static volatile sig_atomic_t promote_triggered = false;
+static volatile sig_atomic_t promote_signaled = false;
/*
* Flag set when executing a restore command, to tell SIGTERM signal handler
@@ -63,7 +63,7 @@ StartupProcTriggerHandler(SIGNAL_ARGS)
{
int save_errno = errno;
- promote_triggered = true;
+ promote_signaled = true;
WakeupRecovery();
errno = save_errno;
@@ -197,13 +197,13 @@ PostRestoreCommand(void)
}
bool
-IsPromoteTriggered(void)
+IsPromoteSignaled(void)
{
- return promote_triggered;
+ return promote_signaled;
}
void
-ResetPromoteTriggered(void)
+ResetPromoteSignaled(void)
{
- promote_triggered = false;
+ promote_signaled = false;
}
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 98b033fc20..331497bcfb 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -313,6 +313,7 @@ extern XLogRecPtr GetFlushRecPtr(void);
extern XLogRecPtr GetLastImportantRecPtr(void);
extern void RemovePromoteSignalFiles(void);
+extern bool PromoteIsTriggered(void);
extern bool CheckPromoteSignal(void);
extern void WakeupRecovery(void);
extern void SetWalWriterSleeping(bool sleeping);
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 9f59c1ffa3..bec313764a 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -16,7 +16,7 @@ extern void HandleStartupProcInterrupts(void);
extern void StartupProcessMain(void) pg_attribute_noreturn();
extern void PreRestoreCommand(void);
extern void PostRestoreCommand(void);
-extern bool IsPromoteTriggered(void);
-extern void ResetPromoteTriggered(void);
+extern bool IsPromoteSignaled(void);
+extern void ResetPromoteSignaled(void);
#endif /* _STARTUP_H */