On Thu, Mar 11, 2021 at 2:32 PM Thomas Munro <thomas.mu...@gmail.com> wrote: > On Thu, Mar 11, 2021 at 2:25 PM Tom Lane <t...@sss.pgh.pa.us> wrote: > > Trolling the net, I found a newer-looking version of the man page, > > and behold it says > > > > In mainline kernel versions prior to 5.8, syncfs() will fail only > > when passed a bad file descriptor (EBADF). Since Linux 5.8, > > syncfs() will also report an error if one or more inodes failed > > to be written back since the last syncfs() call. > > > > So this means that in less-than-bleeding-edge kernels, syncfs can > > only be regarded as a dangerous toy. If we expose an option to use > > it, there had better be large blinking warnings in the docs. > > Agreed. Perhaps we could also try to do something programmatic about that.
Time being of the essence, here is the patch I posted last year, this time with a GUC and some docs. You can set sync_after_crash to "fsync" (default) or "syncfs" if you have it. I would plan to extend that to include a third option as already discussed in the other thread, maybe something like "wal" (= sync WAL files and then do extra analysis of WAL data to sync only data modified since checkpoint but not replayed), but that'd be material for PG15.
From e1a14240a08b802b669c7a4d2a7cacc1004a7de4 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sun, 27 Sep 2020 17:22:10 +1300 Subject: [PATCH v2] Optionally use syncfs() for SyncDataDirectory() on Linux. Opening every file can be slow, as the first step before crash recovery can begin. Provide an alternative method, where we call syncfs() on every possibly different filesystem under the data directory. This avoids faulting in inodes for potentially very many inodes. The option can be controlled with a new GUC: sync_after_crash={fsync,syncfs} Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com --- configure | 2 +- configure.ac | 1 + doc/src/sgml/config.sgml | 28 +++++++++ src/backend/storage/file/fd.c | 62 ++++++++++++++++++- src/backend/utils/misc/guc.c | 17 +++++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/pg_config.h.in | 3 + src/include/storage/fd.h | 8 +++ 8 files changed, 120 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 3fd4cecbeb..6a2051da9d 100755 --- a/configure +++ b/configure @@ -15239,7 +15239,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l writev +for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink readv setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l writev do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 2f1585adc0..dd819ab2c2 100644 --- a/configure.ac +++ b/configure.ac @@ -1681,6 +1681,7 @@ AC_CHECK_FUNCS(m4_normalize([ strchrnul strsignal symlink + syncfs sync_file_range uselocale wcstombs_l diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index a218d78bef..f8bd82a729 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9679,6 +9679,34 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' </listitem> </varlistentry> + <varlistentry id="guc-sync-after-crash" xreflabel="sync_after_crash"> + <term><varname>sync_after_crash</varname> (<type>enum</type>) + <indexterm> + <primary><varname>sync_after_crash</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + When set to <literal>fsync</literal>, which is the default, + <productname>PostgreSQL</productname> will recursively open and fsync + all files in the data directory before crash recovery begins. This + is intended to make sure that all WAL and data files are durably stored + on disk before replaying changes. + </para> + <para> + On Linux, <literal>syncfs</literal> may be used instead, to ask the + operating system to flush the whole file system. This may be a lot + faster, because it doesn't need to open each file one by one. On the + other hand, it may be slower if the file system is shared by other + applications that modify a lot of files, since those files will also + be flushed to disk. Furthermore, on version of Linux before 5.8, I/O + errors encountered while writing data to disk may not be reported to + <productname>PostgreSQL</productname>, and relevant error messages may + appear only in kernel logs. + </para> + </listitem> + </varlistentry> + </variablelist> </sect1> diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index b58502837a..74554c8617 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -72,9 +72,11 @@ #include "postgres.h" +#include <dirent.h> #include <sys/file.h> #include <sys/param.h> #include <sys/stat.h> +#include <sys/types.h> #ifndef WIN32 #include <sys/mman.h> #endif @@ -158,6 +160,9 @@ int max_safe_fds = FD_MINFREE; /* default if not changed */ /* Whether it is safe to continue running after fsync() fails. */ bool data_sync_retry = false; +/* How SyncDataDirectory should do its job. */ +int sync_after_crash = SYNC_AFTER_CRASH_FSYNC; + /* Debugging.... */ #ifdef FDDEBUG @@ -3263,9 +3268,27 @@ looks_like_temp_rel_name(const char *name) return true; } +#ifdef HAVE_SYNCFS +static void +do_syncfs(const char *path) +{ + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + { + elog(LOG, "could not open %s: %m", path); + return; + } + if (syncfs(fd) < 0) + elog(LOG, "syncfs failed for %s: %m", path); + close(fd); +} +#endif /* - * Issue fsync recursively on PGDATA and all its contents. + * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for + * all potential filesystem, depending on sync_after_crash setting. * * We fsync regular files and directories wherever they are, but we * follow symlinks only for pg_wal and immediately under pg_tblspc. @@ -3289,6 +3312,10 @@ void SyncDataDirectory(void) { bool xlog_is_symlink; +#ifdef HAVE_SYNCFS + DIR *dir; + struct dirent *de; +#endif /* We can skip this whole thing if fsync is disabled. */ if (!enableFsync) @@ -3317,6 +3344,39 @@ SyncDataDirectory(void) xlog_is_symlink = true; #endif +#ifdef HAVE_SYNCFS + if (sync_after_crash == SYNC_AFTER_CRASH_SYNCFS) + { + /* + * On Linux, we don't have to open every single file one by one. We + * can use syncfs() to sync whole filesystems. We only expect + * filesystem boundaries to exist where we tolerate symlinks, namely + * pg_wal and the tablespaces, so we call syncfs() for each of those + * directories. + */ + + /* Sync the top level pgdata directory. */ + do_syncfs("."); + /* If any tablespaces are configured, sync each of those. */ + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(dir, "pg_tblspc"))) + { + char path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name); + do_syncfs(path); + } + FreeDir(dir); + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs("pg_wal"); + return; + } +#endif /* !HAVE_SYNCFS */ + /* * If possible, hint to the kernel that we're soon going to fsync the data * directory and its contents. Errors in this step are even less diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 855076b1fd..fea4ce6523 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -487,6 +487,14 @@ const struct config_enum_entry ssl_protocol_versions_info[] = { StaticAssertDecl(lengthof(ssl_protocol_versions_info) == (PG_TLS1_3_VERSION + 2), "array length mismatch"); +static struct config_enum_entry sync_after_crash_options[] = { + {"fsync", SYNC_AFTER_CRASH_FSYNC, false}, +#ifdef HAVE_SYNCFS + {"syncfs", SYNC_AFTER_CRASH_SYNCFS, false}, +#endif + {NULL, 0, false} +}; + static struct config_enum_entry shared_memory_options[] = { #ifndef WIN32 {"sysv", SHMEM_TYPE_SYSV, false}, @@ -4840,6 +4848,15 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"sync_after_crash", PGC_POSTMASTER, ERROR_HANDLING_OPTIONS, + gettext_noop("Sets the method for synchronizing the data directory before crash recovery."), + }, + &sync_after_crash, + SYNC_AFTER_CRASH_FSYNC, sync_after_crash_options, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index f46c2dd7a8..b9edfd4787 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -757,6 +757,7 @@ #exit_on_error = off # terminate session on any error? #restart_after_crash = on # reinitialize after backend crash? +#sync_after_crash = fsync # fsync or syncfs (Linux only, 5.8+) #data_sync_retry = off # retry or panic on failure to fsync # data? # (change requires restart) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 7a7cc21d8d..7bb5743e3d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -590,6 +590,9 @@ /* Define to 1 if you have the `symlink' function. */ #undef HAVE_SYMLINK +/* Define to 1 if you have the `syncfs' function. */ +#undef HAVE_SYNCFS + /* Define to 1 if you have the `sync_file_range' function. */ #undef HAVE_SYNC_FILE_RANGE diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 30bf7d2193..44fc1cb2a9 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -45,6 +45,13 @@ #include <dirent.h> +typedef enum SyncDataDirectoryMode { + SYNC_AFTER_CRASH_FSYNC, +#ifdef HAVE_SYNCFS + SYNC_AFTER_CRASH_SYNCFS, +#endif +} SyncDataDirectoryMode; + struct iovec; /* avoid including port/pg_iovec.h here */ typedef int File; @@ -53,6 +60,7 @@ typedef int File; /* GUC parameter */ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; +extern int sync_after_crash; /* * This is private to fd.c, but exported for save/restore_backend_variables() -- 2.30.1