> If a .ready file is created out of order, the directory scan logic
> will pick it up about as soon as possible based on its priority.  If
> the archiver is keeping up relatively well, there's a good chance such
> a file will have the highest archival priority and will be picked up
> the next time the archiver looks for a file to archive.  With the
> patch proposed in this thread, an out-of-order .ready file has no such
> guarantee.  As long as the archiver never has to fall back to a
> directory scan, it won't be archived.  The proposed patch handles the
> case where RemoveOldXlogFiles() creates missing .ready files by
> forcing a directory scan, but I'm not sure this is enough.  I think we
> have to check the archiver state each time we create a .ready file to
> see whether we're creating one out-of-order.

We can handle the scenario where a .ready file is created out of order
in XLogArchiveNotify(). This way we avoid making an explicit call to
enable a directory scan from each of the different code paths that may
result in creating an out-of-order .ready file.

The archiver can store the segment number corresponding to the last (most
recent) .ready file it found. When a .ready file is created in
XLogArchiveNotify(), the log segment number of the new .ready file can be
compared with the segment number of the last .ready file found by the
archiver to detect whether this file is created out of order. A directory
scan can be forced if required.

I have incorporated these changes in patch v11.

> While this may be an extremely rare problem in practice, archiving
> something after the next checkpoint completes seems better than never
> archiving it at all.  IMO this isn't an area where there is much space
> to take risks.

An alternative approach could be to force a directory scan at each
checkpoint, to break the otherwise indefinite wait for a .ready file that
is being missed because it was created out of order. This would make sure
that the file gets archived within the checkpoint boundaries.

Thoughts?

Please find attached patch v11.

Thanks,
Dipesh
From 9392fd1b82ade933e8127845013bb2940239af68 Mon Sep 17 00:00:00 2001
From: Dipesh Pandit <dipesh.pan...@enterprisedb.com>
Date: Tue, 24 Aug 2021 12:17:34 +0530
Subject: [PATCH] mitigate directory scan for WAL archiver

The WAL archiver scans the status directory to identify the next WAL
file that needs to be archived. This directory scan can be minimised
by maintaining the log segment number of the WAL file currently being
archived and incrementing it by '1' to get the next WAL file. The
archiver can check the availability of the next file and, if it is
not available, fall back to a directory scan to get the oldest WAL
file.

There are some special scenarios, like a timeline switch, a backup, or
a .ready file created out of order, which require the archiver to
perform a full directory scan, as archiving these files takes
precedence over regular WAL files.
---
 src/backend/access/transam/xlogarchive.c |  23 ++++
 src/backend/postmaster/pgarch.c          | 211 ++++++++++++++++++++++++++++---
 src/backend/storage/lmgr/lwlocknames.txt |   1 +
 src/include/postmaster/pgarch.h          |   4 +
 4 files changed, 224 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index b9c19b2..cb73895 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -465,12 +465,16 @@ KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
  *
  * Optionally, nudge the archiver process so that it'll notice the file we
  * create.
+ *
+ * Also, notifies archiver to enable directory scan to handle a few special
+ * scenarios.
  */
 void
 XLogArchiveNotify(const char *xlog, bool nudge)
 {
 	char		archiveStatusPath[MAXPGPATH];
 	FILE	   *fd;
+	bool		fileOutOfOrder = false;
 
 	/* insert an otherwise empty file called <XLOG>.ready */
 	StatusFilePath(archiveStatusPath, xlog, ".ready");
@@ -492,6 +496,25 @@ XLogArchiveNotify(const char *xlog, bool nudge)
 		return;
 	}
 
+	/* Check if .ready file is created out of order */
+	if (IsXLogFileName(xlog))
+	{
+		XLogSegNo	curSegNo;
+		TimeLineID	tli;
+
+		XLogFromFileName(xlog, &tli, &curSegNo, wal_segment_size);
+
+		fileOutOfOrder = PgArchIsBrokenReadyFileOrder(curSegNo);
+	}
+
+	/*
+	 * History files or a .ready file created out of order require the archiver
+	 * to perform a full directory scan.
+	 */
+	if (IsTLHistoryFileName(xlog) || IsBackupHistoryFileName(xlog) ||
+			fileOutOfOrder)
+		PgArchEnableDirScan();
+
 	/* If caller requested, let archiver know it's got work to do */
 	if (nudge)
 		PgArchWakeup();
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 74a7d7c..a33648a 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -76,8 +76,31 @@
 typedef struct PgArchData
 {
 	int			pgprocno;		/* pgprocno of archiver process */
+
+	/*
+	 * Flag to enable/disable directory scan. When set, it forces the archiver
+	 * to perform a full directory scan to get the next log segment. The flag
+	 * is read and written without a lock; the intent is that a request missed
+	 * in the current cycle still triggers a directory scan in the next cycle.
+	 * NOTE(review): confirm a concurrent reset cannot lose a request.
+	 */
+	bool		dirScan;
+
+	/*
+	 * Segment number of the most recent .ready file found by archiver,
+	 * protected by WALArchiveLock.
+	 */
+	XLogSegNo	lastReadySegNo;
 } PgArchData;
 
+/*
+ * Segment number and timeline ID to identify the next file in a WAL sequence
+ */
+typedef struct readyXLogState
+{
+	XLogSegNo	lastSegNo;		/* segment number of next anticipated WAL file */
+	TimeLineID	lastTLI;		/* timeline ID of next anticipated WAL file */
+} readyXLogState;
 
 /* ----------
  * Local data
@@ -97,12 +120,13 @@ static volatile sig_atomic_t ready_to_stop = false;
  */
 static void pgarch_waken_stop(SIGNAL_ARGS);
 static void pgarch_MainLoop(void);
-static void pgarch_ArchiverCopyLoop(void);
+static void pgarch_ArchiverCopyLoop(readyXLogState *xlogState);
 static bool pgarch_archiveXlog(char *xlog);
-static bool pgarch_readyXlog(char *xlog);
+static bool pgarch_readyXlog(char *xlog, readyXLogState *xlogState);
 static void pgarch_archiveDone(char *xlog);
 static void pgarch_die(int code, Datum arg);
 static void HandlePgArchInterrupts(void);
+static void PgArchUpdateLastReadySegNo(XLogSegNo segNo);
 
 /* Report shared memory space needed by PgArchShmemInit */
 Size
@@ -221,6 +245,43 @@ PgArchWakeup(void)
 		SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch);
 }
 
+/*
+ * Set the dirScan flag in shared memory (PgArch->dirScan). A backend calls
+ * this when an action requires the archiver to do a full directory scan.
+ */
+void
+PgArchEnableDirScan(void)
+{
+	PgArch->dirScan = true;
+}
+
+/*
+ * Returns true if the .ready file for curSegNo is created out of order, i.e.
+ * its segment number precedes the last .ready file found by the archiver.
+ */
+bool
+PgArchIsBrokenReadyFileOrder(XLogSegNo curSegNo)
+{
+	XLogSegNo	lastSegNo;
+
+	/* Only reading shared state here, so a shared lock is sufficient */
+	LWLockAcquire(WALArchiveLock, LW_SHARED);
+	lastSegNo = PgArch->lastReadySegNo;
+	LWLockRelease(WALArchiveLock);
+
+	return curSegNo < lastSegNo;
+}
+
+/*
+ * Store segNo as PgArch->lastReadySegNo, holding WALArchiveLock exclusively
+ */
+static void
+PgArchUpdateLastReadySegNo(XLogSegNo segNo)
+{
+	LWLockAcquire(WALArchiveLock, LW_EXCLUSIVE);
+	PgArch->lastReadySegNo = segNo;
+	LWLockRelease(WALArchiveLock);
+}
 
 /* SIGUSR2 signal handler for archiver process */
 static void
@@ -243,10 +304,21 @@ pgarch_waken_stop(SIGNAL_ARGS)
 static void
 pgarch_MainLoop(void)
 {
+	readyXLogState xlogState;
 	pg_time_t	last_copy_time = 0;
 	bool		time_to_stop;
 
 	/*
+	 * Initialize xlogState, segment number and TLI will be reset/updated in
+	 * function pgarch_readyXlog() for each cycle.
+	 */
+	xlogState.lastSegNo = 0;
+	xlogState.lastTLI = 0;
+
+	/* First cycle after startup */
+	PgArchEnableDirScan();
+
+	/*
 	 * There shouldn't be anything for the archiver to do except to wait for a
 	 * signal ... however, the archiver exists to protect our data, so she
 	 * wakes up occasionally to allow herself to be proactive.
@@ -280,7 +352,7 @@ pgarch_MainLoop(void)
 		}
 
 		/* Do what we're here for */
-		pgarch_ArchiverCopyLoop();
+		pgarch_ArchiverCopyLoop(&xlogState);
 		last_copy_time = time(NULL);
 
 		/*
@@ -321,7 +393,7 @@ pgarch_MainLoop(void)
  * Archives all outstanding xlogs then returns
  */
 static void
-pgarch_ArchiverCopyLoop(void)
+pgarch_ArchiverCopyLoop(readyXLogState *xlogState)
 {
 	char		xlog[MAX_XFN_CHARS + 1];
 
@@ -331,7 +403,7 @@ pgarch_ArchiverCopyLoop(void)
 	 * some backend will add files onto the list of those that need archiving
 	 * while we are still copying earlier archives
 	 */
-	while (pgarch_readyXlog(xlog))
+	while (pgarch_readyXlog(xlog, xlogState))
 	{
 		int			failures = 0;
 		int			failures_orphan = 0;
@@ -596,29 +668,96 @@ pgarch_archiveXlog(char *xlog)
  * larger ID; the net result being that past timelines are given higher
  * priority for archiving.  This seems okay, or at least not obviously worth
  * changing.
+ *
+ * WAL files are generated in a specific order of log segment number. The
+ * directory scan for each WAL file can be minimised by identifying the next
+ * WAL file in the sequence. This can be achieved by maintaining log segment
+ * number and timeline ID corresponding to WAL file currently being archived.
+ * The log segment number of current WAL file can be incremented by '1' to
+ * point to the next WAL file in a sequence. Full directory scan can be avoided
+ * by checking the availability of next WAL file. "xlogState" specifies the
+ * segment number and timeline ID corresponding to the next WAL file.
+ *
+ * However, a full directory scan is performed in some special cases where it
+ * requires us to archive files which takes precedence over the next anticipated
+ * log segment. For example, history file takes precedence over archiving WAL
+ * files on older timeline or an older WAL file which is being left out because
+ * corresponding .ready file is created out of order or archiving a backup
+ * history file created during backup.
+ *
+ * Returns "true" if a segment is ready for archival, "xlog" represents the
+ * name of the segment.
  */
 static bool
-pgarch_readyXlog(char *xlog)
+pgarch_readyXlog(char *xlog, readyXLogState *xlogState)
 {
-	/*
-	 * open xlog status directory and read through list of xlogs that have the
-	 * .ready suffix, looking for earliest file. It is possible to optimise
-	 * this code, though only a single file is expected on the vast majority
-	 * of calls, so....
-	 */
+	char		basename[MAX_XFN_CHARS + 1];
+	char		xlogready[MAXPGPATH];
 	char		XLogArchiveStatusDir[MAXPGPATH];
 	DIR		   *rldir;
 	struct dirent *rlde;
+	struct stat	st;
 	bool		found = false;
 	bool		historyFound = false;
 
+	/*
+	 * Skip the directory scan unless it is requested by the shared memory
+	 * flag dirScan.
+	 */
+	if (!PgArch->dirScan)
+	{
+		/*
+		 * We already have the next anticipated log segment and timeline, check
+		 * if this WAL is ready to be archived.
+		 */
+		XLogFileName(basename, xlogState->lastTLI, xlogState->lastSegNo, wal_segment_size);
+		StatusFilePath(xlogready, basename, ".ready");
+
+		if (stat(xlogready, &st) == 0)
+		{
+			strcpy(xlog, basename);
+
+			/* Found .ready file, update its segment number in shared memory */
+			PgArchUpdateLastReadySegNo(xlogState->lastSegNo);
+
+			/*
+			 * Increment the readyXLogState's lastSegNo to point to the next
+			 * WAL file. Although we have not yet archived the current WAL file
+			 * and readyXLogState points to the next WAL file, this is safe
+			 * because the next cycle will not begin until we finish archiving
+			 * current WAL file.
+			 */
+			xlogState->lastSegNo++;
+			return true;
+		}
+	}
+
+	/*
+	 * Perform a full directory scan to identify the next log segment. There
+	 * may be one of the following scenarios which may require us to perform a
+	 * full directory scan.
+	 *
+	 * - This is the first cycle since archiver has started and there is no
+	 *   idea about the next anticipated log segment.
+	 *
+	 * - There is a timeline switch, archive history file as part of this
+	 *   timeline switch.
+	 *
+	 * - .ready file is created out of order.
+	 *
+	 * - A backup history file created during backup.
+	 *
+	 * - The next anticipated log segment is not available.
+	 *
+	 * open xlog status directory and read through list of xlogs that have the
+	 * .ready suffix, looking for earliest file.
+	 */
 	snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
 	rldir = AllocateDir(XLogArchiveStatusDir);
 
 	while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
 	{
 		int			basenamelen = (int) strlen(rlde->d_name) - 6;
-		char		basename[MAX_XFN_CHARS + 1];
 		bool		ishistory;
 
 		/* Ignore entries with unexpected number of characters */
@@ -638,8 +777,14 @@ pgarch_readyXlog(char *xlog)
 		memcpy(basename, rlde->d_name, basenamelen);
 		basename[basenamelen] = '\0';
 
-		/* Is this a history file? */
-		ishistory = IsTLHistoryFileName(basename);
+		/*
+		 * Archiving a timeline history file takes precedence over WAL files.
+		 * If this directory scan was enabled to archive a backup history file,
+		 * give that file the same priority so it is archived before the scan
+		 * gets disabled as part of archiving a regular WAL file.
+		 */
+		ishistory = IsTLHistoryFileName(basename) ||
+			IsBackupHistoryFileName(basename);
 
 		/*
 		 * Consume the file to archive.  History files have the highest
@@ -661,6 +806,42 @@ pgarch_readyXlog(char *xlog)
 				strcpy(xlog, basename);
 		}
 	}
+
+	if (found)
+	{
+		if (IsXLogFileName(xlog))
+		{
+			/*
+			 * Reset the flag only when we found a regular WAL file to make
+			 * sure that we are done with processing history files.
+			 */
+			PgArch->dirScan = false;
+
+			/*
+			 * Make sure that the flag reset is flushed to memory before it is
+			 * examined or set for the next cycle.
+			 */
+			pg_memory_barrier();
+
+			/*
+			 * Reset segment number and timeline ID as this is the beginning of a
+			 * new sequence.
+			 */
+			XLogFromFileName(xlog, &xlogState->lastTLI, &xlogState->lastSegNo,
+					wal_segment_size);
+
+			/* Update last segment number in shared memory */
+			PgArchUpdateLastReadySegNo(xlogState->lastSegNo);
+
+			/* Increment log segment number to point to the next WAL file */
+			xlogState->lastSegNo++;
+		}
+
+		ereport(LOG,
+				(errmsg("directory scan to archive write-ahead log file \"%s\"",
+						xlog)));
+	}
+
 	FreeDir(rldir);
 
 	return found;
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c..ced38f8 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+WALArchiveLock						48
diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h
index 1e47a14..5800fa8 100644
--- a/src/include/postmaster/pgarch.h
+++ b/src/include/postmaster/pgarch.h
@@ -13,6 +13,8 @@
 #ifndef _PGARCH_H
 #define _PGARCH_H
 
+#include "access/xlogdefs.h"
+
 /* ----------
  * Archiver control info.
  *
@@ -31,5 +33,7 @@ extern void PgArchShmemInit(void);
 extern bool PgArchCanRestart(void);
 extern void PgArchiverMain(void) pg_attribute_noreturn();
 extern void PgArchWakeup(void);
+extern void PgArchEnableDirScan(void);
+extern bool PgArchIsBrokenReadyFileOrder(XLogSegNo curSegNo);
 
 #endif							/* _PGARCH_H */
-- 
1.8.3.1

Reply via email to