Intro
==========
The main purpose of this feature is to achieve read-your-writes
consistency when asynchronous replicas are used for reads and the
primary is used for writes. In that setup the LSN of the last
modification is stored by the application. We cannot store this LSN
inside the database, since reads are distributed across all replicas
and the primary.
Procedure style implementation
==========
https://www.postgresql.org/message-id/27171.1586439221%40sss.pgh.pa.us
https://www.postgresql.org/message-id/20210121.173009.235021120161403875.horikyota.ntt%40gmail.com
CALL pg_wait_lsn('LSN', timeout);
Examples
==========
primary                                  standby
-------                                  --------
                                         postgresql.conf
                                         recovery_min_apply_delay = 10s

CREATE TABLE tbl AS SELECT generate_series(1,10) AS a;
INSERT INTO tbl VALUES (generate_series(11, 20));
SELECT pg_current_wal_lsn();

                                         CALL pg_wait_lsn('0/3002AE8', 10000);
                                         BEGIN;
                                         SELECT * FROM tbl; -- read fresh insertions
                                         COMMIT;
Fixed and ready to review.
--
Ivan Kartyshov
Postgres Professional: www.postgrespro.com
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 5b225ccf4f..cb1c175c62 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -27861,6 +27861,19 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
extension.
</para></entry>
</row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>pg_wait_lsn</primary>
+ </indexterm>
+ <function>pg_wait_lsn</function> ( <parameter>wait_lsn</parameter> <type>pg_lsn</type>, <parameter>timeout</parameter> <type>bigint</type> )
+ </para>
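+ <para>
+ Waits until the standby has replayed WAL up to
+ <parameter>wait_lsn</parameter>, or until <parameter>timeout</parameter>
+ milliseconds have elapsed. This procedure can only be executed during
+ recovery.
+ If <parameter>timeout</parameter> is zero or negative, the timeout is
+ disabled.
+ Raises an error if the target <parameter>wait_lsn</parameter> was not
+ replayed before the timeout expired.
+ </para></entry>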
+ </row>
</tbody>
</tgroup>
</table>
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 29c5bec084..0b783dc733 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -43,6 +43,7 @@
#include "backup/basebackup.h"
#include "catalog/pg_control.h"
#include "commands/tablespace.h"
+#include "commands/waitlsn.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -1828,6 +1829,14 @@ PerformWalRecovery(void)
break;
}
+ /*
+ * If we replayed an LSN that someone was waiting for, set latches
+ * in shared memory array to notify the waiter.
+ */
+ if (waitLSN &&
+ (XLogRecoveryCtl->lastReplayedEndRecPtr >= pg_atomic_read_u64(&waitLSN->minLSN)))
+ WaitLSNSetLatches(XLogRecoveryCtl->lastReplayedEndRecPtr);
+
/* Else, try to fetch the next WAL record */
record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
} while (record != NULL);
diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile
index 48f7348f91..cede90c3b9 100644
--- a/src/backend/commands/Makefile
+++ b/src/backend/commands/Makefile
@@ -61,6 +61,7 @@ OBJS = \
vacuum.o \
vacuumparallel.o \
variable.o \
- view.o
+ view.o \
+ waitlsn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/commands/meson.build b/src/backend/commands/meson.build
index 6dd00a4abd..7549be5dc3 100644
--- a/src/backend/commands/meson.build
+++ b/src/backend/commands/meson.build
@@ -50,4 +50,5 @@ backend_sources += files(
'vacuumparallel.c',
'variable.c',
'view.c',
+ 'waitlsn.c',
)
diff --git a/src/backend/commands/waitlsn.c b/src/backend/commands/waitlsn.c
new file mode 100644
index 0000000000..f9b701c946
--- /dev/null
+++ b/src/backend/commands/waitlsn.c
@@ -0,0 +1,309 @@
+/*-------------------------------------------------------------------------
+ *
+ * waitlsn.c
+ * Implements waiting for the given LSN, which is used in
+ * CALL pg_wait_lsn(wait_lsn pg_lsn, timeout bigint).
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/commands/waitlsn.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <float.h>
+#include <math.h>
+
+#include "pgstat.h"
+#include "fmgr.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "access/xlogrecovery.h"
+#include "catalog/pg_type.h"
+#include "commands/waitlsn.h"
+#include "executor/spi.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "storage/sinvaladt.h"
+#include "utils/builtins.h"
+#include "utils/pg_lsn.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+#include "utils/fmgrprotos.h"
+
+/*
+ * Add this backend's entry to / delete it from the shared memory array of
+ * LSN waiters (waitLSN->procInfos).
+ */
+static void addLSNWaiter(XLogRecPtr lsn);
+static void deleteLSNWaiter(void);
+
+/* Pointer to the shared WaitLSNState; stays NULL until WaitLSNShmemInit() runs */
+struct WaitLSNState *waitLSN = NULL;
+/*
+ * Set by WaitForLSN() while this backend has an entry registered in the
+ * shared array; WaitLSNCleanup() consults it to decide whether to delete.
+ */
+static volatile sig_atomic_t haveShmemItem = false;
+
+/*
+ * WaitLSNShmemSize
+ *		Compute the number of bytes of shared memory WaitLSNState needs: the
+ *		fixed-size header followed by MaxBackends WaitLSNProcInfo slots.
+ */
+Size
+WaitLSNShmemSize(void)
+{
+	Size		slots_size = mul_size(MaxBackends, sizeof(WaitLSNProcInfo));
+
+	return add_size(offsetof(WaitLSNState, procInfos), slots_size);
+}
+
+/*
+ * WaitLSNShmemInit
+ *		Create or attach to the WaitLSNState structure in shared memory and
+ *		point the global 'waitLSN' at it.  The contents are initialized only
+ *		when ShmemInitStruct() reports that the structure did not exist yet.
+ */
+void
+WaitLSNShmemInit(void)
+{
+	bool		already_initialized;
+
+	waitLSN = (WaitLSNState *) ShmemInitStruct("WaitLSNState",
+											   WaitLSNShmemSize(),
+											   &already_initialized);
+	if (already_initialized)
+		return;
+
+	/* Fresh structure: no waiters, so no minimum LSN to watch for */
+	pg_atomic_init_u64(&waitLSN->minLSN, PG_UINT64_MAX);
+	waitLSN->numWaitedProcs = 0;
+	SpinLockInit(&waitLSN->mutex);
+}
+
+/*
+ * addLSNWaiter
+ *		Register the calling backend as waiting for 'lsn'.
+ *
+ * The entry (MyProcNumber, lsn) is inserted into waitLSN->procInfos so that
+ * the array stays sorted by waitLSN in ascending order, and waitLSN->minLSN
+ * is refreshed to the smallest LSN anybody is waiting for.  Everything is
+ * done while holding waitLSN->mutex.
+ */
+static void
+addLSNWaiter(XLogRecPtr lsn)
+{
+	WaitLSNProcInfo cur;
+	int			i;
+
+	/* Build the new entry before taking the spinlock */
+	cur.procnum = MyProcNumber;
+	cur.waitLSN = lsn;
+
+	SpinLockAcquire(&waitLSN->mutex);
+
+	/*
+	 * Single-pass insertion: whenever an existing entry is not smaller than
+	 * the one being carried, swap them and carry the displaced entry on
+	 * towards the tail.
+	 */
+	for (i = 0; i < waitLSN->numWaitedProcs; i++)
+	{
+		if (waitLSN->procInfos[i].waitLSN >= cur.waitLSN)
+		{
+			WaitLSNProcInfo tmp = waitLSN->procInfos[i];
+
+			waitLSN->procInfos[i] = cur;
+			cur = tmp;
+		}
+	}
+
+	/* The carried entry is now the largest one; append it */
+	waitLSN->procInfos[i] = cur;
+	waitLSN->numWaitedProcs++;
+
+	/*
+	 * The array is sorted ascending, so the minimum is the FIRST element.
+	 * (Index i is the last element, i.e. the maximum; publishing that would
+	 * keep the startup process from waking waiters with smaller targets.)
+	 */
+	pg_atomic_write_u64(&waitLSN->minLSN, waitLSN->procInfos[0].waitLSN);
+
+	SpinLockRelease(&waitLSN->mutex);
+}
+
+/*
+ * deleteLSNWaiter
+ *		Remove the calling backend's entry from waitLSN->procInfos.
+ *
+ * Does nothing if no entry with MyProcNumber is present.  Otherwise the
+ * following entries are shifted down by one slot, the count is decremented
+ * and waitLSN->minLSN is refreshed.  Everything is done while holding
+ * waitLSN->mutex.
+ */
+static void
+deleteLSNWaiter(void)
+{
+	int			i;
+	bool		found = false;
+
+	SpinLockAcquire(&waitLSN->mutex);
+
+	for (i = 0; i < waitLSN->numWaitedProcs; i++)
+	{
+		if (waitLSN->procInfos[i].procnum == MyProcNumber)
+			found = true;
+
+		/* From our slot onwards, pull each successor one position down */
+		if (found && i < waitLSN->numWaitedProcs - 1)
+			waitLSN->procInfos[i] = waitLSN->procInfos[i + 1];
+	}
+
+	if (found)
+	{
+		waitLSN->numWaitedProcs--;
+
+		/*
+		 * The array is sorted ascending, so the new minimum is element 0.
+		 * The loop index must not be used here: it equals the old element
+		 * count and therefore points past the live entries.
+		 */
+		if (waitLSN->numWaitedProcs != 0)
+			pg_atomic_write_u64(&waitLSN->minLSN,
+								waitLSN->procInfos[0].waitLSN);
+		else
+			pg_atomic_write_u64(&waitLSN->minLSN, PG_UINT64_MAX);
+	}
+
+	SpinLockRelease(&waitLSN->mutex);
+}
+
+/*
+ * WaitLSNSetLatches
+ *		Wake up every registered waiter whose target LSN is <= curLSN.
+ *
+ * curLSN is the LSN up to which WAL has been replayed.  The latch of each
+ * satisfied waiter is set, the satisfied entries are removed from the head
+ * of the sorted array, and waitLSN->minLSN is refreshed.  Everything is
+ * done while holding waitLSN->mutex.
+ *
+ * NOTE(review): SetLatch() is called with the spinlock held; confirm this is
+ * acceptable, or collect the process numbers and set latches after release.
+ */
+void
+WaitLSNSetLatches(XLogRecPtr curLSN)
+{
+	int			i;
+	int			numWakeUpProcs;
+
+	SpinLockAcquire(&waitLSN->mutex);
+
+	/*
+	 * The array is sorted by waitLSN, so the satisfied waiters form a prefix;
+	 * stop at the first entry that still lies ahead of curLSN.
+	 */
+	for (i = 0; i < waitLSN->numWaitedProcs; i++)
+	{
+		PGPROC	   *backend;
+
+		if (waitLSN->procInfos[i].waitLSN > curLSN)
+			break;
+
+		backend = GetPGProcByNumber(waitLSN->procInfos[i].procnum);
+		SetLatch(&backend->procLatch);
+	}
+	numWakeUpProcs = i;
+
+	/*
+	 * Immediately remove those processes from the shmem array.  Otherwise,
+	 * shmem array items will be here till corresponding processes wake up
+	 * and delete themselves.
+	 */
+	if (numWakeUpProcs > 0)
+	{
+		for (i = 0; i < waitLSN->numWaitedProcs - numWakeUpProcs; i++)
+			waitLSN->procInfos[i] = waitLSN->procInfos[i + numWakeUpProcs];
+		waitLSN->numWaitedProcs -= numWakeUpProcs;
+	}
+
+	/*
+	 * After compaction the smallest remaining target is element 0.  The loop
+	 * index must not be used here: it equals the new element count and thus
+	 * refers to a stale slot past the live entries.
+	 */
+	if (waitLSN->numWaitedProcs != 0)
+		pg_atomic_write_u64(&waitLSN->minLSN, waitLSN->procInfos[0].waitLSN);
+	else
+		pg_atomic_write_u64(&waitLSN->minLSN, PG_UINT64_MAX);
+
+	SpinLockRelease(&waitLSN->mutex);
+}
+
+/*
+ * WaitLSNCleanup
+ *		Delete our item from the shmem array, if we have registered one.
+ *
+ * The flag is cleared afterwards so that a repeated call does not take the
+ * spinlock and rescan the array for an entry that is already gone.
+ */
+void
+WaitLSNCleanup(void)
+{
+	if (haveShmemItem)
+	{
+		deleteLSNWaiter();
+		haveShmemItem = false;
+	}
+}
+
+/*
+ * WaitForLSN
+ *		Wait on MyLatch until the given LSN has been replayed or the timeout
+ *		expires.
+ *
+ * lsn			target LSN that replay must reach.
+ * millisecs	timeout in milliseconds; 0 means wait without a timeout.
+ *				Values above one day are reduced to one day with a WARNING.
+ *
+ * Raises an ERROR if recovery is not in progress, or if the timeout expires
+ * before 'lsn' has been replayed.  Returns normally once it has been.
+ */
+void
+WaitForLSN(XLogRecPtr lsn, uint64 millisecs)
+{
+	const uint64 max_timeout_ms = 86400000; /* one day */
+	const int	latch_events = WL_TIMEOUT | WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
+	XLogRecPtr	curLSN;
+	TimestampTz endtime;
+
+	/* Shouldn't be called when shmem isn't initialized */
+	Assert(waitLSN);
+
+	/* Should be only called by a backend */
+	Assert(MyBackendType == B_BACKEND);
+
+	if (!RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("recovery is not in progress"),
+				 errhint("Waiting for LSN can only be executed during recovery.")));
+
+	if (millisecs > max_timeout_ms)
+	{
+		millisecs = max_timeout_ms;
+		elog(WARNING,"Timeout for pg_wait_lsn() restricted to 1 day");
+	}
+
+	/*
+	 * An earlier wait that was aborted by an error thrown out of
+	 * CHECK_FOR_INTERRUPTS() leaves its entry in the shared array.  Drop it
+	 * now, so that one backend never occupies more than one slot.
+	 */
+	if (haveShmemItem)
+	{
+		deleteLSNWaiter();
+		haveShmemItem = false;
+	}
+
+	/* Fast path: nothing to register if the LSN has been replayed already */
+	curLSN = GetXLogReplayRecPtr(NULL);
+	if (lsn <= curLSN)
+		return;
+
+	endtime = GetCurrentTimestamp() + millisecs * 1000;
+
+	/* Raise the flag first; deleteLSNWaiter() copes with a missing entry */
+	haveShmemItem = true;
+	addLSNWaiter(lsn);
+
+	for (;;)
+	{
+		long		delay_ms;
+		int			rc;
+
+		/* Check if the waited LSN has been replayed */
+		curLSN = GetXLogReplayRecPtr(NULL);
+		if (lsn <= curLSN)
+			break;
+
+		if (millisecs > 0)
+			delay_ms = (endtime - GetCurrentTimestamp()) / 1000;
+		else
+			/* If no timeout is set then wake up in 1 minute for interrupts */
+			delay_ms = 60000;
+
+		/* Timeout expired */
+		if (delay_ms <= 0)
+			break;
+
+		/*
+		 * This may throw an error (e.g. query cancel).  Our array entry then
+		 * stays behind until the next WaitForLSN() call or WaitLSNCleanup()
+		 * removes it.
+		 */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * No explicit postmaster check is needed: WL_EXIT_ON_PM_DEATH is part
+		 * of latch_events.
+		 *
+		 * NOTE(review): the wait event below is borrowed from another
+		 * feature; a dedicated wait event would be more accurate.
+		 */
+		rc = WaitLatch(MyLatch, latch_events, delay_ms,
+					   WAIT_EVENT_WAIT_FOR_STANDBY_CONFIRMATION);
+
+		if (rc & WL_LATCH_SET)
+			ResetLatch(MyLatch);
+	}
+
+	/*
+	 * Always remove our entry.  WaitLSNSetLatches() removes it only when it
+	 * wakes us, but we can also leave the loop because of a timeout or
+	 * because we noticed replay progress ourselves.  This is a no-op if the
+	 * entry is already gone.
+	 */
+	deleteLSNWaiter();
+	haveShmemItem = false;
+
+	if (lsn > curLSN)
+		ereport(ERROR,
+				(errcode(ERRCODE_QUERY_CANCELED),
+				 errmsg("canceling waiting for LSN due to timeout")));
+}
+
+/*
+ * pg_wait_lsn
+ *		SQL-callable procedure: CALL pg_wait_lsn(lsn pg_lsn, timeout int8).
+ *
+ * Argument 0 is the target LSN, argument 1 the timeout in milliseconds.  A
+ * zero or negative timeout disables the timeout.  Raises an ERROR when
+ * called in an atomic context; WaitForLSN() raises further errors (not in
+ * recovery, timeout expired).
+ */
+Datum
+pg_wait_lsn(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	trg_lsn = PG_GETARG_LSN(0);
+	/* The argument is declared int8, so it must be fetched as a 64-bit value */
+	int64		timeout = PG_GETARG_INT64(1);
+	CallContext *context = (CallContext *) fcinfo->context;
+
+	if (context->atomic)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("pg_wait_lsn() must be only called in non-atomic context")));
+
+	/*
+	 * Map negative values to 0 ("no timeout").  Converting a negative number
+	 * to uint64 would instead yield a huge timeout.
+	 */
+	if (timeout < 0)
+		timeout = 0;
+
+	/*
+	 * Release the active snapshot before waiting (presumably so that no
+	 * snapshot is held for the duration of the wait -- confirm).
+	 */
+	if (ActiveSnapshotSet())
+		PopActiveSnapshot();
+	Assert(!ActiveSnapshotSet());
+
+	WaitForLSN(trg_lsn, (uint64) timeout);
+
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 521ed5418c..5aed90c935 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -25,6 +25,7 @@
#include "access/xlogprefetcher.h"
#include "access/xlogrecovery.h"
#include "commands/async.h"
+#include "commands/waitlsn.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
@@ -152,6 +153,7 @@ CalculateShmemSize(int *num_semaphores)
size = add_size(size, WaitEventExtensionShmemSize());
size = add_size(size, InjectionPointShmemSize());
size = add_size(size, SlotSyncShmemSize());
+ size = add_size(size, WaitLSNShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
@@ -244,6 +246,11 @@ CreateSharedMemoryAndSemaphores(void)
/* Initialize subsystems */
CreateOrAttachShmemStructs();
+ /*
+ * Init array of Latches in shared memory for wait lsn
+ */
+ WaitLSNShmemInit();
+
#ifdef EXEC_BACKEND
/*
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 162b1f919d..4b830dc3c8 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -36,6 +36,7 @@
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xlogutils.h"
+#include "commands/waitlsn.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
@@ -862,6 +863,11 @@ ProcKill(int code, Datum arg)
*/
LWLockReleaseAll();
+ /*
+ * Cleanup waiting for LSN if any.
+ */
+ WaitLSNCleanup();
+
/* Cancel any pending condition variable sleep, too */
ConditionVariableCancelSleep();
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 177d81a891..21222bf18c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12135,6 +12135,11 @@
prorettype => 'bytea', proargtypes => 'pg_brin_minmax_multi_summary',
prosrc => 'brin_minmax_multi_summary_send' },
+# wait on a standby until the given LSN has been replayed
+# (OID taken from the 8000-9999 range reserved for patches under development)
+{ oid => '8593', descr => 'wait for LSN until timeout',
+  proname => 'pg_wait_lsn', prokind => 'p', prorettype => 'void',
+  proargtypes => 'pg_lsn int8', proargnames => '{trg_lsn,delay}',
+  prosrc => 'pg_wait_lsn' },
+
{ oid => '6291', descr => 'arbitrary value from among input values',
proname => 'any_value', prokind => 'a', proisstrict => 'f',
prorettype => 'anyelement', proargtypes => 'anyelement',
diff --git a/src/include/commands/waitlsn.h b/src/include/commands/waitlsn.h
new file mode 100644
index 0000000000..192e434d87
--- /dev/null
+++ b/src/include/commands/waitlsn.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * waitlsn.h
+ * Declarations for LSN waiting routines.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * src/include/commands/waitlsn.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef WAIT_LSN_H
+#define WAIT_LSN_H
+
+#include "postgres.h"
+#include "port/atomics.h"
+#include "storage/spin.h"
+#include "tcop/dest.h"
+
+/*
+ * Shared memory structure: one entry of the waiter array, describing a
+ * single process that waits for an LSN.
+ */
+typedef struct WaitLSNProcInfo
+{
+	/* process number of the waiter (MyProcNumber); used to find its latch */
+ int procnum;
+	/* LSN the process is waiting for */
+ XLogRecPtr waitLSN;
+} WaitLSNProcInfo;
+
+/*
+ * Shared state of the LSN-waiting machinery.  Allocated with room for
+ * MaxBackends entries in procInfos (see WaitLSNShmemSize()).
+ */
+typedef struct WaitLSNState
+{
+	/*
+	 * LSN published for the recovery loop to compare against the replay
+	 * position; PG_UINT64_MAX when there are no waiters.
+	 */
+ pg_atomic_uint64 minLSN;
+	/* spinlock taken around every access to numWaitedProcs and procInfos */
+ slock_t mutex;
+	/* number of valid entries in procInfos */
+ int numWaitedProcs;
+	/* waiting processes, kept ordered by waitLSN (ascending) */
+ WaitLSNProcInfo procInfos[FLEXIBLE_ARRAY_MEMBER];
+} WaitLSNState;
+
+/* Pointer to the shared state; NULL until WaitLSNShmemInit() has been called */
+extern PGDLLIMPORT struct WaitLSNState *waitLSN;
+
+/* Wait until 'lsn' is replayed or 'millisecs' elapse (0 = no timeout) */
+extern void WaitForLSN(XLogRecPtr lsn, uint64 millisecs);
+/* Shared memory sizing and initialization */
+extern Size WaitLSNShmemSize(void);
+extern void WaitLSNShmemInit(void);
+/* Set the latches of all waiters whose target LSN is <= curLSN */
+extern void WaitLSNSetLatches(XLogRecPtr curLSN);
+/* Remove the calling process's entry from the shared array, if it has one */
+extern void WaitLSNCleanup(void);
+
+#endif /* WAIT_LSN_H */
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index b1eb77b1ec..bc47c93902 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -51,6 +51,7 @@ tests += {
't/040_standby_failover_slots_sync.pl',
't/041_checkpoint_at_promote.pl',
't/042_low_level_backup.pl',
+ 't/043_wait_lsn.pl',
],
},
}
diff --git a/src/test/recovery/t/043_wait_lsn.pl b/src/test/recovery/t/043_wait_lsn.pl
new file mode 100644
index 0000000000..04b45eefec
--- /dev/null
+++ b/src/test/recovery/t/043_wait_lsn.pl
@@ -0,0 +1,62 @@
+# Checks that CALL pg_wait_lsn() on a standby waits until the requested LSN
+# has been replayed, and that it reports a timeout for an LSN which is not
+# reached in time.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->start;
+
+# Add some content and take a backup
+$node_primary->safe_psql('postgres',
+	"CREATE TABLE wait_test AS SELECT generate_series(1,10) AS a");
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create a streaming standby with a 1 second delay from the backup
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+my $delay = 1;
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+$node_standby->append_conf(
+	'postgresql.conf', qq[
+	recovery_min_apply_delay = '${delay}s'
+]);
+$node_standby->start;
+
+
+# Add new content to the primary and memorize the primary's new LSN, then
+# wait for that LSN on the standby.  Because of the apply delay the standby
+# has to actually wait, and pg_wait_lsn() must return once replay caught up.
+$node_primary->safe_psql('postgres',
+	"INSERT INTO wait_test VALUES (generate_series(11, 20))");
+my $lsn1 =
+  $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+my $output = $node_standby->safe_psql(
+	'postgres', qq[
+	CALL pg_wait_lsn('${lsn1}', 1000000);
+	SELECT pg_lsn_cmp(pg_last_wal_replay_lsn(), '${lsn1}'::pg_lsn);
+]);
+
+# The standby's replay position must have reached the primary's LSN.  It may
+# already be past it if more WAL was generated and replayed meanwhile, so
+# accept "equal or greater" rather than demanding exact equality.
+ok($output >= 0,
+	"standby replayed at least up to primary's LSN after pg_wait_lsn()");
+
+# An LSN that is already replayed must be accepted without error even with a
+# tiny timeout, while an LSN beyond the primary's current position must time
+# out.
+my $lsn2 =
+  $node_primary->safe_psql('postgres', "SELECT pg_current_wal_lsn() + 1");
+my $stderr;
+$node_standby->safe_psql('postgres', "CALL pg_wait_lsn('${lsn1}', 1);");
+$node_standby->psql(
+	'postgres',
+	"CALL pg_wait_lsn('${lsn2}', 1);",
+	stderr => \$stderr);
+ok( $stderr =~ /canceling waiting for LSN due to timeout/,
+	"get timeout on waiting for unreachable LSN");
+
+
+$node_standby->stop;
+$node_primary->stop;
+done_testing();