On Fri, Mar 19, 2021 at 2:16 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> PS: For illustration/discussion, I've also attached a "none" patch.  I
> also couldn't resist rebasing my "wal" mode patch, which I plan to
> propose for PG15 because there is not enough time left for this
> release.

Erm... I attached the wrong version by mistake.  Here's a better one.
(Note: I'm not expecting any review of the 0003 patch in this CF, I
just wanted to share the future direction I'd like to consider for
this problem.)
From a5faf47f720684bd7952ecf0ca0c495d9ec14989 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 27 Sep 2020 17:22:10 +1300
Subject: [PATCH v5 1/3] Provide recovery_init_sync_method=syncfs.

On some systems, opening every file in the data directory can be very
slow, as the first step before crash recovery can begin.  Provide an
alternative method, for Linux only, where we call syncfs() on every
possibly different filesystem under the data directory.  This avoids
faulting in inodes for potentially very many inodes.  It comes with some
caveats, so it's not the default.

Reported-by: Michael Brown <michael.br...@discourse.org>
Reviewed-by: Fujii Masao <masao.fu...@oss.nttdata.com>
Reviewed-by: Paul Guo <gu...@vmware.com>
Reviewed-by: Bruce Momjian <br...@momjian.us>
Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
---
 configure                                     |  2 +-
 configure.ac                                  |  1 +
 doc/src/sgml/config.sgml                      | 30 +++++++++
 src/backend/storage/file/fd.c                 | 61 ++++++++++++++++++-
 src/backend/utils/misc/guc.c                  | 17 ++++++
 src/backend/utils/misc/postgresql.conf.sample |  1 +
 src/include/pg_config.h.in                    |  3 +
 src/include/storage/fd.h                      |  6 ++
 src/tools/msvc/Solution.pm                    |  1 +
 9 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 3fd4cecbeb..6a2051da9d 100755
--- a/configure
+++ b/configure
@@ -15239,7 +15239,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev
+for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 2f1585adc0..dd819ab2c2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1681,6 +1681,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	strchrnul
 	strsignal
 	symlink
+	syncfs
 	sync_file_range
 	uselocale
 	wcstombs_l
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 863ac31c6b..32de8235bf 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9721,6 +9721,36 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-recovery-init-sync-method" xreflabel="recovery_init_sync_method">
+      <term><varname>recovery_init_sync_method</varname> (<type>enum</type>)
+       <indexterm>
+        <primary><varname>recovery_init_sync_method</varname> configuration parameter</primary>
+       </indexterm>
+      </term>
+      <listitem>
+       <para>
+        When set to <literal>fsync</literal>, which is the default,
+        <productname>PostgreSQL</productname> will recursively open and fsync
+        all files in the data directory before crash recovery begins.  This
+        is intended to make sure that all WAL and data files are durably stored
+        on disk before replaying changes.  This applies whenever starting a
+        database cluster that did not shut down cleanly, including copies
+        created with <application>pg_basebackup</application>.
+       </para>
+       <para>
+        On Linux, <literal>syncfs</literal> may be used instead, to ask the
+        operating system to flush the whole file system.  This may be a lot
+        faster, because it doesn't need to open each file one by one.  On the
+        other hand, it may be slower if the file system is shared by other
+        applications that modify a lot of files, since those files will also
+        be flushed to disk.  Furthermore, on versions of Linux before 5.8, I/O
+        errors encountered while writing data to disk may not be reported to
+        <productname>PostgreSQL</productname>, and relevant error messages may
+        appear only in kernel logs.
+       </para>
+      </listitem>
+     </varlistentry>
+
     </variablelist>
 
    </sect1>
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 110ba31517..14a07f09a4 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -72,9 +72,11 @@
 
 #include "postgres.h"
 
+#include <dirent.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -158,6 +160,9 @@ int			max_safe_fds = FD_MINFREE;	/* default if not changed */
 /* Whether it is safe to continue running after fsync() fails. */
 bool		data_sync_retry = false;
 
+/* How SyncDataDirectory() should do its job. */
+int			recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+
 /* Debugging.... */
 
 #ifdef FDDEBUG
@@ -3265,9 +3270,27 @@ looks_like_temp_rel_name(const char *name)
 	return true;
 }
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int		fd;
+
+	fd = OpenTransientFile(path, O_RDONLY);
+	if (fd < 0)
+	{
+		elog(LOG, "could not open %s: %m", path);
+		return;
+	}
+	if (syncfs(fd) < 0)
+		elog(LOG, "could not sync filesystem for \"%s\": %m", path);
+	CloseTransientFile(fd);
+}
+#endif
 
 /*
- * Issue fsync recursively on PGDATA and all its contents.
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystem, depending on recovery_init_sync_method setting.
  *
  * We fsync regular files and directories wherever they are, but we
  * follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3319,6 +3342,42 @@ SyncDataDirectory(void)
 		xlog_is_symlink = true;
 #endif
 
+#ifdef HAVE_SYNCFS
+	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
+	{
+		DIR		   *dir;
+		struct dirent *de;
+
+		/*
+		 * On Linux, we don't have to open every single file one by one.  We
+		 * can use syncfs() to sync whole filesystems.  We only expect
+		 * filesystem boundaries to exist where we tolerate symlinks, namely
+		 * pg_wal and the tablespaces, so we call syncfs() for each of those
+		 * directories.
+		 */
+
+		/* Sync the top level pgdata directory. */
+		do_syncfs(".");
+		/* If any tablespaces are configured, sync each of those. */
+		dir = AllocateDir("pg_tblspc");
+		while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
+		{
+			char		path[MAXPGPATH];
+
+			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+				continue;
+
+			snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+			do_syncfs(path);
+		}
+		FreeDir(dir);
+		/* If pg_wal is a symlink, process that too. */
+		if (xlog_is_symlink)
+			do_syncfs("pg_wal");
+		return;
+	}
+#endif		/* !HAVE_SYNCFS */
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.  Errors in this step are even less
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5a3ca5b765..f7aafdf772 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -487,6 +487,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = {
 StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2),
 				 "array length mismatch");
 
+static struct config_enum_entry recovery_init_sync_method_options[] = {
+	{"fsync", RECOVERY_INIT_SYNC_METHOD_FSYNC, false},
+#ifdef HAVE_SYNCFS
+	{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
+#endif
+	{NULL, 0, false}
+};
+
 static struct config_enum_entry shared_memory_options[] = {
 #ifndef WIN32
 	{"sysv", SHMEM_TYPE_SYSV, false},
@@ -4859,6 +4867,15 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"recovery_init_sync_method", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS,
+			gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
+		},
+		&recovery_init_sync_method,
+		RECOVERY_INIT_SYNC_METHOD_FSYNC, recovery_init_sync_method_options,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 3ff507d5f6..76b9e815a2 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -760,6 +760,7 @@
 #restart_after_crash = on		# reinitialize after backend crash?
 #remove_temp_files_after_crash = on	# remove temporary files after
 					# backend crash?
+#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+)
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 7a7cc21d8d..7bb5743e3d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -590,6 +590,9 @@
 /* Define to 1 if you have the `symlink' function. */
 #undef HAVE_SYMLINK
 
+/* Define to 1 if you have the `syncfs' function. */
+#undef HAVE_SYNCFS
+
 /* Define to 1 if you have the `sync_file_range' function. */
 #undef HAVE_SYNC_FILE_RANGE
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 30bf7d2193..328473bdc9 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -45,6 +45,11 @@
 
 #include <dirent.h>
 
+typedef enum RecoveryInitSyncMethod {
+	RECOVERY_INIT_SYNC_METHOD_FSYNC,
+	RECOVERY_INIT_SYNC_METHOD_SYNCFS
+} RecoveryInitSyncMethod;
+
 struct iovec;					/* avoid including port/pg_iovec.h here */
 
 typedef int File;
@@ -53,6 +58,7 @@ typedef int File;
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
+extern int recovery_init_sync_method;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index a4f5cc4bdb..43d171e544 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -388,6 +388,7 @@ sub GenerateFiles
 		HAVE_STRUCT_TM_TM_ZONE                   => undef,
 		HAVE_SYNC_FILE_RANGE                     => undef,
 		HAVE_SYMLINK                             => 1,
+		HAVE_SYNCFS                              => undef,
 		HAVE_SYSLOG                              => undef,
 		HAVE_SYS_EPOLL_H                         => undef,
 		HAVE_SYS_EVENT_H                         => undef,
-- 
2.30.1

From 9743a09cb3c0817dc0aeedb0df9d8aafb1454f98 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 19 Mar 2021 13:06:35 +1300
Subject: [PATCH v5 2/3] Provide recovery_init_sync_method=none.

Although it's rife with hazards and not a good option to use in general,
some expert users may want to be able to disable the initial sync
performed before crash recovery.  Explicitly discouraged in the
documentation.

Discussion https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
---
 doc/src/sgml/config.sgml                      | 8 ++++++++
 src/backend/storage/file/fd.c                 | 4 ++++
 src/backend/utils/misc/guc.c                  | 1 +
 src/backend/utils/misc/postgresql.conf.sample | 2 +-
 src/include/storage/fd.h                      | 3 ++-
 5 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 32de8235bf..e547fcebd8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9748,6 +9748,14 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
         <productname>PostgreSQL</productname>, and relevant error messages may
         appear only in kernel logs.
        </para>
+       <para>
+        The option <literal>none</literal> can be specified to skip the step.
+        This is only safe if all buffered data is known to have been flushed
+        to disk already, for example by a tool such as
+        <application>pg_basebackup</application>.  It is not a good idea to
+        leave this option configured permanently in general, as it will
+        affect future crash recovery cycles.
+       </para>
       </listitem>
      </varlistentry>
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 14a07f09a4..63ef7b6a92 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3319,6 +3319,10 @@ SyncDataDirectory(void)
 	if (!enableFsync)
 		return;
 
+	/* Likewise if this specific sync step is disabled. */
+	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_NONE)
+		return;
+
 	/*
 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
 	 * because the first walkdir below will ignore it.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f7aafdf772..f3a8e8e92e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -492,6 +492,7 @@ static struct config_enum_entry recovery_init_sync_method_options[] = {
 #ifdef HAVE_SYNCFS
 	{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
 #endif
+	{"none", RECOVERY_INIT_SYNC_METHOD_NONE, false},
 	{NULL, 0, false}
 };
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 76b9e815a2..8b1610f6ec 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -760,7 +760,7 @@
 #restart_after_crash = on		# reinitialize after backend crash?
 #remove_temp_files_after_crash = on	# remove temporary files after
 					# backend crash?
-#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+)
+#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+), none
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 328473bdc9..dca44c38c4 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -47,7 +47,8 @@
 
 typedef enum RecoveryInitSyncMethod {
 	RECOVERY_INIT_SYNC_METHOD_FSYNC,
-	RECOVERY_INIT_SYNC_METHOD_SYNCFS
+	RECOVERY_INIT_SYNC_METHOD_SYNCFS,
+	RECOVERY_INIT_SYNC_METHOD_NONE
 } RecoveryInitSyncMethod;
 
 struct iovec;					/* avoid including port/pg_iovec.h here */
-- 
2.30.1

From 04970c321e9969862f3a3f2107657dc3aa6469d9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 26 Sep 2020 23:07:48 +1200
Subject: [PATCH v5 3/3] Provide recovery_init_sync_method=wal.

It's important to avoid relying on dirty data in the kernel's cache when
deciding that a change has already been applied.

The existing options have disadvantages:

 * recovery_init_sync_method=fsync has to open every file, which
   can be very slow for millions of files on high latency storage.
 * recovery_init_sync_method=syncfs is Linux-only, doesn't report
   errors on older versions and also syncs data outside pgdata
 * recovery_init_sync_method=none requires human action to make
   sure data is synced, and even when that's done correctly, it applies
   to all future crash recovery cycles until the setting is changed

This new option syncs only the WAL files, and then relies on analysis of
the WAL during replay.  We start by assuming that modifications from
before the last checkpoint were already flushed by the checkpoint.  That
is true if you crashed or ran pg_basebackup to the present location, but
false if you made a second copy with non-syncing tools.  Then, if
full_page_writes=on, we expect all data modified since the last
checkpoint to be written to disk as part of the end-of-recovery
checkpoint.  If full_page_writes=off, things are slightly trickier: we
skip updating pages that have an LSN higher than a WAL record
("BLK_DONE"), but we don't trust any data we read from the kernel's
cache to be really on disk, so we skip replay but we still make a note
to sync the file at the next checkpoint anyway.

(A slightly more paranoid mode would mark skipped pages dirty instead,
to trigger a rewrite of all pages we skip, if our threat model includes
operating systems that might mark their own page cache pages "clean"
rather than invalidating them after a writeback failure.  That is not
done in this commit.)

Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      |  8 ++-
 src/backend/access/transam/xlog.c             | 52 +++++++++++++------
 src/backend/access/transam/xlogutils.c        | 11 ++++
 src/backend/storage/buffer/bufmgr.c           | 20 +++++++
 src/backend/storage/file/fd.c                 | 12 +++--
 src/backend/storage/smgr/md.c                 | 15 ++++++
 src/backend/storage/smgr/smgr.c               | 15 ++++++
 src/backend/utils/misc/guc.c                  |  3 +-
 src/backend/utils/misc/postgresql.conf.sample |  2 +-
 src/include/storage/bufmgr.h                  |  1 +
 src/include/storage/fd.h                      |  1 +
 src/include/storage/md.h                      |  1 +
 src/include/storage/smgr.h                    |  2 +
 13 files changed, 120 insertions(+), 23 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e547fcebd8..7d4d59a02f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9729,7 +9729,13 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </term>
       <listitem>
        <para>
-        When set to <literal>fsync</literal>, which is the default,
+        When set to <literal>wal</literal>, which is the default,
+        <productname>PostgreSQL</productname> will fsync all WAL files before
+        recovery begins, and rely on WAL replay to make sure all modified
+        data files are handle correctly.
+       </para>
+       <para>
+        When set to <literal>fsync</literal>,
         <productname>PostgreSQL</productname> will recursively open and fsync
         all files in the data directory before crash recovery begins.  This
         is intended to make sure that all WAL and data files are durably stored
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6f8810e149..89678e6470 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -927,7 +927,7 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
-static void RemoveTempXlogFiles(void);
+static void ScanXlogFilesAfterCrash(void);
 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
 static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
 						   XLogSegNo *endlogSegNo);
@@ -4030,13 +4030,15 @@ UpdateLastRemovedPtr(char *filename)
 }
 
 /*
- * Remove all temporary log files in pg_wal
+ * Remove all temporary log files in pg_wal, and make sure that all remaining
+ * files are down on disk before we replay anything in them if
+ * recovery_init_sync_method requires that.
  *
  * This is called at the beginning of recovery after a previous crash,
  * at a point where no other processes write fresh WAL data.
  */
 static void
-RemoveTempXlogFiles(void)
+ScanXlogFilesAfterCrash(void)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -4048,12 +4050,23 @@ RemoveTempXlogFiles(void)
 	{
 		char		path[MAXPGPATH];
 
-		if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
-			continue;
-
 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
-		unlink(path);
-		elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+
+		if (strncmp(xlde->d_name, "xlogtemp.", 9) == 0)
+		{
+			unlink(path);
+			elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+		}
+		else if (IsXLogFileName(xlde->d_name) &&
+				 recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
+		{
+			/*
+			 * Make sure that whatever we read from this WAL file is durably on
+			 * disk before replaying in RECOVERY_INIT_SYNC_METHOD_WAL mode.
+			 * Necessary because SyncDataDirectory() won't do that for us.
+			 */
+			fsync_fname(path, false);
+		}
 	}
 	FreeDir(xldir);
 }
@@ -6547,23 +6560,30 @@ StartupXLOG(void)
 	ValidateXLOGDirectoryStructure();
 
 	/*----------
-	 * If we previously crashed, perform a couple of actions:
+	 * If we previously crashed:
 	 *
 	 * - The pg_wal directory may still include some temporary WAL segments
 	 *   used when creating a new segment, so perform some clean up to not
-	 *   bloat this path.  This is done first as there is no point to sync
-	 *   this temporary data.
+	 *   bloat this path, in ScanXlogFilesAfterCrash().
+	 *
+	 * - It's possible that some WAL data had been written but not yet synced.
+	 *   Therefore we have to sync these files before replaying the records
+	 *   they contain.  If recovery_init_wal_sync_method=wal we'll do that
+	 *   in ScanXlogFilesAfterCrash(); otherwise we'll leave it up to
+	 *   SyncDataDirectory().
 	 *
 	 * - There might be data which we had written, intending to fsync it, but
-	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
-	 *   the near future might cause earlier unflushed writes to be lost, even
-	 *   though more recent data written to disk from here on would be
-	 *   persisted.  To avoid that, fsync the entire data directory.
+	 *   which we had not actually fsync'd yet.  If
+	 *   recovery_init_wal_sync_method=wal, then XLogReadBufferForRedo() will
+	 *   compute the set of files that may need fsyncing at the next
+	 *   checkpoint, during recovery.  Otherwise, SyncDataDirectory() will
+	 *   sync the entire pgdata directory up front, to avoid relying on data
+	 *   from the kernel's cache that hasn't reached disk yet.
 	 */
 	if (ControlFile->state != DB_SHUTDOWNED &&
 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
 	{
-		RemoveTempXlogFiles();
+		ScanXlogFilesAfterCrash();
 		SyncDataDirectory();
 	}
 
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index d17d660f46..85303e761e 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -401,7 +401,18 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 					LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
 			}
 			if (lsn <= PageGetLSN(BufferGetPage(*buf)))
+			{
+				/*
+				 * The page is from the future.  We must be in crash recovery.
+				 * We don't need to redo the record, but we do need to make
+				 * sure that the image as we have seen it is durably stored on
+				 * disk as part of the next checkpoint, unless that was already
+				 * done by SyncDataDirectory().
+				 */
+				if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
+					RequestBufferSync(*buf);
 				return BLK_DONE;
+			}
 			else
 				return BLK_NEEDS_REDO;
 		}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 852138f9c9..6393545f0c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1530,6 +1530,26 @@ MarkBufferDirty(Buffer buffer)
 	}
 }
 
+/*
+ * Request that the file containing a buffer be fsynced at the next checkpoint.
+ * This is only used in crash recovery, to make it safe to skip applying WAL on
+ * the basis that the page already has a change.  We want to make sure that the
+ * data we read from the kernel matches what's on disk at the next checkpoint.
+ */
+void
+RequestBufferSync(Buffer buffer)
+{
+	SMgrRelation reln;
+	BufferDesc *bufHdr;
+
+	Assert(BufferIsPinned(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	bufHdr = GetBufferDescriptor(buffer - 1);
+	reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+	smgrsync(reln, bufHdr->tag.forkNum, bufHdr->tag.blockNum);
+}
+
 /*
  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
  *
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 63ef7b6a92..e20ba9c00b 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -336,6 +336,7 @@ static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
 					bool process_symlinks,
 					int elevel);
+
 #ifdef PG_FLUSH_DATA_WORKS
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
@@ -3290,7 +3291,7 @@ do_syncfs(const char *path)
 
 /*
  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
- * all potential filesystem, depending on recovery_init_sync_method setting.
+ * all potential filesystem, depending on the recovery_init_sync_method setting.
  *
  * We fsync regular files and directories wherever they are, but we
  * follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3319,8 +3320,12 @@ SyncDataDirectory(void)
 	if (!enableFsync)
 		return;
 
-	/* Likewise if this specific sync step is disabled. */
-	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_NONE)
+	/*
+	 * Likewise if this specific sync step is disabled, or if we're using WAL
+	 * analysis instead.
+	 */
+	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_NONE ||
+		recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
 		return;
 
 	/*
@@ -3478,7 +3483,6 @@ walkdir(const char *path,
 		(*action) (path, true, elevel);
 }
 
-
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 1e12cfad8e..d74506be31 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -961,6 +961,21 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	}
 }
 
+/*
+ *	mdsync() -- Schedule a sync at the next checkpoint.
+ *
+ * This is useful in crash recovery, to ensure that data we've read from the
+ * kernel matches what is on disk before a checkpoint.
+ */
+void
+mdsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	MdfdVec	   *seg;
+
+	seg = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
+	register_dirty_segment(reln, forknum, seg);
+}
+
 /*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4dc24649df..6856c40300 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -62,6 +62,8 @@ typedef struct f_smgr
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_sync) (SMgrRelation reln, ForkNumber forknum,
+							  BlockNumber blocknum);
 } f_smgr;
 
 static const f_smgr smgrsw[] = {
@@ -79,6 +81,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_read = mdread,
 		.smgr_write = mdwrite,
 		.smgr_writeback = mdwriteback,
+		.smgr_sync = mdsync,
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
 		.smgr_immedsync = mdimmedsync,
@@ -540,6 +543,18 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 											nblocks);
 }
 
+
+/*
+ *	smgrsync() -- Request that the file containing a block be flushed at the
+ *				  next checkpoint.
+ */
+void
+smgrsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	smgrsw[reln->smgr_which].smgr_sync(reln, forknum, blocknum);
+}
+
+
 /*
  *	smgrnblocks() -- Calculate the number of blocks in the
  *					 supplied relation.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f3a8e8e92e..6e73e7b38c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -488,6 +488,7 @@ StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2)
 				 "array length mismatch");
 
 static struct config_enum_entry recovery_init_sync_method_options[] = {
+	{"wal", RECOVERY_INIT_SYNC_METHOD_WAL, false},
 	{"fsync", RECOVERY_INIT_SYNC_METHOD_FSYNC, false},
 #ifdef HAVE_SYNCFS
 	{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
@@ -4873,7 +4874,7 @@ static struct config_enum ConfigureNamesEnum[] =
 			gettext_noop("Sets the method for synchronizing the data directory before crash recovery."),
 		},
 		&recovery_init_sync_method,
-		RECOVERY_INIT_SYNC_METHOD_FSYNC, recovery_init_sync_method_options,
+		RECOVERY_INIT_SYNC_METHOD_WAL, recovery_init_sync_method_options,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 8b1610f6ec..96107d641e 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -760,7 +760,7 @@
 #restart_after_crash = on		# reinitialize after backend crash?
 #remove_temp_files_after_crash = on	# remove temporary files after
 					# backend crash?
-#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+), none
+#recovery_init_sync_method = wal	# wal, fsync, syncfs (Linux 5.8+), none
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index fb00fda6a7..f5409264da 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -186,6 +186,7 @@ extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
+extern void RequestBufferSync(Buffer buffer);
 extern void IncrBufferRefCount(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index dca44c38c4..801b2a4601 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -46,6 +46,7 @@
 #include <dirent.h>
 
 typedef enum RecoveryInitSyncMethod {
+	RECOVERY_INIT_SYNC_METHOD_WAL,
 	RECOVERY_INIT_SYNC_METHOD_FSYNC,
 	RECOVERY_INIT_SYNC_METHOD_SYNCFS,
 	RECOVERY_INIT_SYNC_METHOD_NONE
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 752b440864..35e813410a 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -40,6 +40,7 @@ extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void mdsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum);
 
 extern void ForgetDatabaseSyncRequests(Oid dbid);
 extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a6fbf7b6a6..4a9cc9f181 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -103,6 +103,8 @@ extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 						 int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void smgrsync(SMgrRelation reln, ForkNumber forknum,
+					 BlockNumber blocknum);
 extern void AtEOXact_SMgr(void);
 
 #endif							/* SMGR_H */
-- 
2.30.1

Reply via email to