On Sat, Sep 3, 2016 at 10:22 PM, Michael Paquier <michael.paqu...@gmail.com> wrote: > On Sat, Sep 3, 2016 at 10:21 PM, Michael Paquier > <michael.paqu...@gmail.com> wrote: >> Oh, well. I have just implemented it on top of the two other patches >> for pg_basebackup. For pg_receivexlog, I am wondering if it makes >> sense to have it. That would be trivial to implement it, and I think >> that we had better make the combination of --synchronous and --nosync >> just leave with an error. Thoughts about having that for >> pg_receivexlog? > > With patches that's actually better..
Meh, meh et meh. -- Michael
From be6888046cc7dcfde33c22294e8d94a9369ff6b5 Mon Sep 17 00:00:00 2001 From: Michael Paquier <michael@otacoo.com> Date: Fri, 2 Sep 2016 15:19:11 +0900 Subject: [PATCH 1/3] Relocation fsync routines of initdb into src/common Those are aimed at being used by other utilities, pg_basebackup mainly, and at some other degree by pg_dump and pg_receivexlog. --- src/bin/initdb/initdb.c | 266 +------------------------------------- src/common/Makefile | 2 +- src/common/file_utils.c | 279 ++++++++++++++++++++++++++++++++++++++++ src/include/common/file_utils.h | 22 ++++ src/tools/msvc/Mkvcbuild.pm | 2 +- 5 files changed, 310 insertions(+), 261 deletions(-) create mode 100644 src/common/file_utils.c create mode 100644 src/include/common/file_utils.h diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 9407492..0b74ff9 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -61,6 +61,7 @@ #endif #include "catalog/catalog.h" +#include "common/file_utils.h" #include "common/restricted_token.h" #include "common/username.h" #include "mb/pg_wchar.h" @@ -70,13 +71,6 @@ #include "fe_utils/string_utils.h" -/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ -#if defined(HAVE_SYNC_FILE_RANGE) -#define PG_FLUSH_DATA_WORKS 1 -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) -#define PG_FLUSH_DATA_WORKS 1 -#endif - /* Ideally this would be in a .h file, but it hardly seems worth the trouble */ extern const char *select_default_timezone(const char *share_path); @@ -237,13 +231,6 @@ static char **filter_lines_with_token(char **lines, const char *token); #endif static char **readfile(const char *path); static void writefile(char *path, char **lines); -static void walkdir(const char *path, - void (*action) (const char *fname, bool isdir), - bool process_symlinks); -#ifdef PG_FLUSH_DATA_WORKS -static void pre_sync_fname(const char *fname, bool isdir); -#endif -static void fsync_fname_ext(const char *fname, bool isdir); static FILE *popen_check(const char *command, const char *mode); static void exit_nicely(void); static char *get_id(void); @@ -270,7 +257,6 @@ static void load_plpgsql(FILE *cmdfd); static void vacuum_db(FILE *cmdfd); static void make_template0(FILE *cmdfd); static void make_postgres(FILE *cmdfd); -static void fsync_pgdata(void); static void trapsig(int signum); static void check_ok(void); static char *escape_quotes(const char *src); @@ -529,177 +515,6 @@ writefile(char *path, char **lines) } /* - * walkdir: recursively walk a directory, applying the action to each - * regular file and directory (including the named directory itself). - * - * If process_symlinks is true, the action and recursion are also applied - * to regular files and directories that are pointed to by symlinks in the - * given directory; otherwise symlinks are ignored. Symlinks are always - * ignored in subdirectories, ie we intentionally don't pass down the - * process_symlinks flag to recursive calls. - * - * Errors are reported but not considered fatal. - * - * See also walkdir in fd.c, which is a backend version of this logic. - */ -static void -walkdir(const char *path, - void (*action) (const char *fname, bool isdir), - bool process_symlinks) -{ - DIR *dir; - struct dirent *de; - - dir = opendir(path); - if (dir == NULL) - { - fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"), - progname, path, strerror(errno)); - return; - } - - while (errno = 0, (de = readdir(dir)) != NULL) - { - char subpath[MAXPGPATH]; - struct stat fst; - int sret; - - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) - continue; - - snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); - - if (process_symlinks) - sret = stat(subpath, &fst); - else - sret = lstat(subpath, &fst); - - if (sret < 0) - { - fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), - progname, subpath, strerror(errno)); - continue; - } - - if (S_ISREG(fst.st_mode)) - (*action) (subpath, false); - else if (S_ISDIR(fst.st_mode)) - walkdir(subpath, action, false); - } - - if (errno) - fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"), - progname, path, strerror(errno)); - - (void) closedir(dir); - - /* - * It's important to fsync the destination directory itself as individual - * file fsyncs don't guarantee that the directory entry for the file is - * synced. Recent versions of ext4 have made the window much wider but - * it's been an issue for ext3 and other filesystems in the past. - */ - (*action) (path, true); -} - -/* - * Hint to the OS that it should get ready to fsync() this file. - * - * Ignores errors trying to open unreadable files, and reports other errors - * non-fatally. - */ -#ifdef PG_FLUSH_DATA_WORKS - -static void -pre_sync_fname(const char *fname, bool isdir) -{ - int fd; - - fd = open(fname, O_RDONLY | PG_BINARY); - - if (fd < 0) - { - if (errno == EACCES || (isdir && errno == EISDIR)) - return; - fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), - progname, fname, strerror(errno)); - return; - } - - /* - * We do what pg_flush_data() would do in the backend: prefer to use - * sync_file_range, but fall back to posix_fadvise. We ignore errors - * because this is only a hint. - */ -#if defined(HAVE_SYNC_FILE_RANGE) - (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE); -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); -#else -#error PG_FLUSH_DATA_WORKS should not have been defined -#endif - - (void) close(fd); -} - -#endif /* PG_FLUSH_DATA_WORKS */ - -/* - * fsync_fname_ext -- Try to fsync a file or directory - * - * Ignores errors trying to open unreadable files, or trying to fsync - * directories on systems where that isn't allowed/required. Reports - * other errors non-fatally. - */ -static void -fsync_fname_ext(const char *fname, bool isdir) -{ - int fd; - int flags; - int returncode; - - /* - * Some OSs require directories to be opened read-only whereas other - * systems don't allow us to fsync files opened read-only; so we need both - * cases here. Using O_RDWR will cause us to fail to fsync files that are - * not writable by our userid, but we assume that's OK. - */ - flags = PG_BINARY; - if (!isdir) - flags |= O_RDWR; - else - flags |= O_RDONLY; - - /* - * Open the file, silently ignoring errors about unreadable files (or - * unsupported operations, e.g. opening a directory under Windows), and - * logging others. - */ - fd = open(fname, flags); - if (fd < 0) - { - if (errno == EACCES || (isdir && errno == EISDIR)) - return; - fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), - progname, fname, strerror(errno)); - return; - } - - returncode = fsync(fd); - - /* - * Some OSes don't allow us to fsync directories at all, so we can ignore - * those errors. Anything else needs to be reported. - */ - if (returncode != 0 && !(isdir && errno == EBADF)) - fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), - progname, fname, strerror(errno)); - - (void) close(fd); -} - -/* * Open a subcommand with suitable error messaging */ static FILE * @@ -2276,77 +2091,6 @@ make_postgres(FILE *cmdfd) PG_CMD_PUTS(*line); } -/* - * Issue fsync recursively on PGDATA and all its contents. - * - * We fsync regular files and directories wherever they are, but we - * follow symlinks only for pg_xlog and immediately under pg_tblspc. - * Other symlinks are presumed to point at files we're not responsible - * for fsyncing, and might not have privileges to write at all. - * - * Errors are reported but not considered fatal. - */ -static void -fsync_pgdata(void) -{ - bool xlog_is_symlink; - char pg_xlog[MAXPGPATH]; - char pg_tblspc[MAXPGPATH]; - - fputs(_("syncing data to disk ... "), stdout); - fflush(stdout); - - snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data); - snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data); - - /* - * If pg_xlog is a symlink, we'll need to recurse into it separately, - * because the first walkdir below will ignore it. - */ - xlog_is_symlink = false; - -#ifndef WIN32 - { - struct stat st; - - if (lstat(pg_xlog, &st) < 0) - fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), - progname, pg_xlog, strerror(errno)); - else if (S_ISLNK(st.st_mode)) - xlog_is_symlink = true; - } -#else - if (pgwin32_is_junction(pg_xlog)) - xlog_is_symlink = true; -#endif - - /* - * If possible, hint to the kernel that we're soon going to fsync the data - * directory and its contents. - */ -#ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false); - if (xlog_is_symlink) - walkdir(pg_xlog, pre_sync_fname, false); - walkdir(pg_tblspc, pre_sync_fname, true); -#endif - - /* - * Now we do the fsync()s in the same order. - * - * The main call ignores symlinks, so in addition to specially processing - * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with - * process_symlinks = true. Note that if there are any plain directories - * in pg_tblspc, they'll get fsync'd twice. That's not an expected case - * so we don't worry about optimizing it. - */ - walkdir(pg_data, fsync_fname_ext, false); - if (xlog_is_symlink) - walkdir(pg_xlog, fsync_fname_ext, false); - walkdir(pg_tblspc, fsync_fname_ext, true); - - check_ok(); -} /* @@ -3512,7 +3256,8 @@ main(int argc, char *argv[]) exit_nicely(); } - fsync_pgdata(); + fsync_pgdata(pg_data, progname); + check_ok(); return 0; } @@ -3574,7 +3319,10 @@ main(int argc, char *argv[]) initialize_data_directory(); if (do_sync) - fsync_pgdata(); + { + fsync_pgdata(pg_data, progname); + check_ok(); + } else printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n")); diff --git a/src/common/Makefile b/src/common/Makefile index a5fa649..03dfaa1 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -44,7 +44,7 @@ OBJS_COMMON = config_info.o controldata_utils.o exec.o ip.o keywords.o \ md5.o pg_lzcompress.o pgfnames.o psprintf.o relpath.o rmtree.o \ string.o username.o wait_error.o -OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o restricted_token.o +OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o) diff --git a/src/common/file_utils.c b/src/common/file_utils.c new file mode 100644 index 0000000..8b97dcb --- /dev/null +++ b/src/common/file_utils.c @@ -0,0 +1,279 @@ +/*------------------------------------------------------------------------- + * + * File-processing utility routines. + * + * Assorted utility functions to work on files. + * + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/common/file_utils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <dirent.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "common/file_utils.h" + + +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(HAVE_SYNC_FILE_RANGE) +#define PG_FLUSH_DATA_WORKS 1 +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif + +#ifdef PG_FLUSH_DATA_WORKS +static void pre_sync_fname(const char *fname, bool isdir, + const char *progname); +#endif +static void walkdir(const char *path, + void (*action) (const char *fname, bool isdir, const char *progname), + bool process_symlinks, const char *progname); + +/* + * Issue fsync recursively on PGDATA and all its contents. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_xlog and immediately under pg_tblspc. + * Other symlinks are presumed to point at files we're not responsible + * for fsyncing, and might not have privileges to write at all. + * + * Errors are reported but not considered fatal. + */ +void +fsync_pgdata(const char *pg_data, const char *progname) +{ + bool xlog_is_symlink; + char pg_xlog[MAXPGPATH]; + char pg_tblspc[MAXPGPATH]; + + fputs(_("syncing data to disk ... "), stdout); + fflush(stdout); + + snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data); + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data); + + /* + * If pg_xlog is a symlink, we'll need to recurse into it separately, + * because the first walkdir below will ignore it. + */ + xlog_is_symlink = false; + +#ifndef WIN32 + { + struct stat st; + + if (lstat(pg_xlog, &st) < 0) + fprintf(stderr, _("could not stat file \"%s\": %s\n"), + pg_xlog, strerror(errno)); + else if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; + } +#else + if (pgwin32_is_junction(pg_xlog)) + xlog_is_symlink = true; +#endif + + /* + * If possible, hint to the kernel that we're soon going to fsync the data + * directory and its contents. + */ +#ifdef PG_FLUSH_DATA_WORKS + walkdir(pg_data, pre_sync_fname, false, progname); + if (xlog_is_symlink) + walkdir(pg_xlog, pre_sync_fname, false, progname); + walkdir(pg_tblspc, pre_sync_fname, true, progname); +#endif + + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially processing + * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with + * process_symlinks = true. Note that if there are any plain directories + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case + * so we don't worry about optimizing it. + */ + walkdir(pg_data, fsync_fname_ext, false, progname); + if (xlog_is_symlink) + walkdir(pg_xlog, fsync_fname_ext, false, progname); + walkdir(pg_tblspc, fsync_fname_ext, true, progname); +} + +/* + * walkdir: recursively walk a directory, applying the action to each + * regular file and directory (including the named directory itself). + * + * If process_symlinks is true, the action and recursion are also applied + * to regular files and directories that are pointed to by symlinks in the + * given directory; otherwise symlinks are ignored. Symlinks are always + * ignored in subdirectories, ie we intentionally don't pass down the + * process_symlinks flag to recursive calls. + * + * Errors are reported but not considered fatal. + * + * See also walkdir in fd.c, which is a backend version of this logic. + */ +static void +walkdir(const char *path, + void (*action) (const char *fname, bool isdir, const char *progname), + bool process_symlinks, const char *progname) +{ + DIR *dir; + struct dirent *de; + + dir = opendir(path); + if (dir == NULL) + { + fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"), + progname, path, strerror(errno)); + return; + } + + while (errno = 0, (de = readdir(dir)) != NULL) + { + char subpath[MAXPGPATH]; + struct stat fst; + int sret; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); + + if (process_symlinks) + sret = stat(subpath, &fst); + else + sret = lstat(subpath, &fst); + + if (sret < 0) + { + fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), + progname, subpath, strerror(errno)); + continue; + } + + if (S_ISREG(fst.st_mode)) + (*action) (subpath, false, progname); + else if (S_ISDIR(fst.st_mode)) + walkdir(subpath, action, false, progname); + } + + if (errno) + fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"), + progname, path, strerror(errno)); + + (void) closedir(dir); + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. Recent versions of ext4 have made the window much wider but + * it's been an issue for ext3 and other filesystems in the past. + */ + (*action) (path, true, progname); +} + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * Ignores errors trying to open unreadable files, and reports other errors + * non-fatally. + */ +#ifdef PG_FLUSH_DATA_WORKS + +static void +pre_sync_fname(const char *fname, bool isdir, const char *progname) +{ + int fd; + + fd = open(fname, O_RDONLY | PG_BINARY); + + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, fname, strerror(errno)); + return; + } + + /* + * We do what pg_flush_data() would do in the backend: prefer to use + * sync_file_range, but fall back to posix_fadvise. We ignore errors + * because this is only a hint. + */ +#if defined(HAVE_SYNC_FILE_RANGE) + (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE); +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +#else +#error PG_FLUSH_DATA_WORKS should not have been defined +#endif + + (void) close(fd); +} + +#endif /* PG_FLUSH_DATA_WORKS */ + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * Ignores errors trying to open unreadable files, or trying to fsync + * directories on systems where that isn't allowed/required. Reports + * other errors non-fatally. + */ +void +fsync_fname_ext(const char *fname, bool isdir, const char *progname) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + /* + * Open the file, silently ignoring errors about unreadable files (or + * unsupported operations, e.g. opening a directory under Windows), and + * logging others. + */ + fd = open(fname, flags); + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, fname, strerror(errno)); + return; + } + + returncode = fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be reported. + */ + if (returncode != 0 && !(isdir && errno == EBADF)) + fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), + progname, fname, strerror(errno)); + + (void) close(fd); +} diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h new file mode 100644 index 0000000..9ff7c40 --- /dev/null +++ b/src/include/common/file_utils.h @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * File-processing utility routines for frontend code + * + * Assorted utility functions to work on files. + * + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/file_utils.h + * + *------------------------------------------------------------------------- + */ +#ifndef FILE_UTILS_H +#define FILE_UTILS_H + +extern void fsync_fname_ext(const char *fname, bool isdir, + const char *progname); +extern void fsync_pgdata(const char *pg_data, const char *progname); + +#endif /* FILE_UTILS_H */ diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index b3ed1f5..f9edc22 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -115,7 +115,7 @@ sub mkvcbuild string.c username.c wait_error.c); our @pgcommonfrontendfiles = ( - @pgcommonallfiles, qw(fe_memutils.c + @pgcommonallfiles, qw(file_utils.c fe_memutils.c restricted_token.c)); our @pgcommonbkndfiles = @pgcommonallfiles; -- 2.9.3
From 424dce3f8af832bd164456e33126eb09326e9bfa Mon Sep 17 00:00:00 2001 From: Michael Paquier <michael@otacoo.com> Date: Fri, 2 Sep 2016 15:36:08 +0900 Subject: [PATCH 2/3] Issue fsync more carefully in pg_receivexlog and pg_basebackup [-X] stream. Several places weren't careful about fsyncing in the way. See 1d4a0ab1 and 606e0f98 for details about required fsyns. This adds a couple of routines in fe_utils which have an equivalent in the backend: - durable_rename - fsync_parent_path --- src/bin/pg_basebackup/pg_basebackup.c | 27 ++++++++++++ src/bin/pg_basebackup/receivelog.c | 56 ++++++++++++++---------- src/common/file_utils.c | 82 ++++++++++++++++++++++++++++++++--- src/include/common/file_utils.h | 5 ++- 4 files changed, 141 insertions(+), 29 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 351a420..a41b2e7 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -25,6 +25,7 @@ #include <zlib.h> #endif +#include "common/file_utils.h" #include "common/string.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" @@ -1115,6 +1116,10 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) if (copybuf != NULL) PQfreemem(copybuf); + + /* sync the resulting tar file, errors are not considered fatal */ + if (strcmp(basedir, "-") != 0) + (void) fsync_fname_ext(filename, false, progname); } @@ -1391,6 +1396,11 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum) if (basetablespace && writerecoveryconf) WriteRecoveryConf(); + + /* + * No data is synced here, everything is done for all tablespaces at the + * end. + */ } /* @@ -1869,6 +1879,23 @@ BaseBackup(void) PQclear(res); PQfinish(conn); + /* + * Make data persistent on disk once backup is completed. For tar + * format once syncing the parent directory is fine, each tar file + * created per tablespace has been already synced. In plain format, + * all the data of the base directory is synced, taking into account + * all the tablespaces. Errors are not considered fatal. + */ + if (format == 't') + { + if (strcmp(basedir, "-") != 0) + (void) fsync_fname_ext(basedir, true, progname); + } + else + { + (void) fsync_pgdata(basedir, progname); + } + if (verbose) fprintf(stderr, "%s: base backup completed\n", progname); } diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 062730b..8634a91 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -23,6 +23,7 @@ #include "libpq-fe.h" #include "access/xlog_internal.h" +#include "common/file_utils.h" /* fd and filename for currently open WAL file */ @@ -68,17 +69,13 @@ mark_file_as_archived(const char *basedir, const char *fname) return false; } - if (fsync(fd) != 0) - { - fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), - progname, tmppath, strerror(errno)); - - close(fd); + close(fd); + if (fsync_fname_ext(tmppath, false, progname) != 0) return false; - } - close(fd); + if (fsync_parent_path(tmppath, progname) != 0) + return false; return true; } @@ -116,6 +113,10 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) /* * Verify that the file is either empty (just created), or a complete * XLogSegSize segment. Anything in between indicates a corrupt file. + * + * XXX: This means that we might not restart if a crash occurs before the + * fsync below. We probably should create the file in a temporary path + * like the backend does... */ if (fstat(f, &statbuf) != 0) { @@ -129,6 +130,16 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) { /* File is open and ready to use */ walfile = f; + + /* + * fsync, in case of a previous crash between padding and fsyncing the + * file. + */ + if (fsync_fname_ext(fn, false, progname) != 0) + return false; + if (fsync_parent_path(fn, progname) != 0) + return false; + return true; } if (statbuf.st_size != 0) @@ -157,6 +168,17 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) } free(zerobuf); + /* + * fsync WAL file and containing directory, to ensure the file is + * persistently created and zeroed. That's particularly important when + * using synchronous mode, where the file is modified and fsynced + * in-place, without a directory fsync. + */ + if (fsync_fname_ext(fn, false, progname) != 0) + return false; + if (fsync_parent_path(fn, progname) != 0) + return false; + if (lseek(f, SEEK_SET, 0) != 0) { fprintf(stderr, @@ -217,10 +239,9 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos) snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix); snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name); - if (rename(oldfn, newfn) != 0) + if (durable_rename(oldfn, newfn, progname) != 0) { - fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"), - progname, current_walfile_name, strerror(errno)); + /* durable_rename produced a log entry */ return false; } } @@ -338,14 +359,6 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content) return false; } - if (fsync(fd) != 0) - { - close(fd); - fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), - progname, tmppath, strerror(errno)); - return false; - } - if (close(fd) != 0) { fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), @@ -356,10 +369,9 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content) /* * Now move the completed history file into place with its final name. */ - if (rename(tmppath, path) < 0) + if (durable_rename(tmppath, path, progname) < 0) { - fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"), - progname, tmppath, path, strerror(errno)); + /* durable_rename produced a log entry */ return false; } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 8b97dcb..3a6e7db 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -34,7 +34,7 @@ static void pre_sync_fname(const char *fname, bool isdir, const char *progname); #endif static void walkdir(const char *path, - void (*action) (const char *fname, bool isdir, const char *progname), + int (*action) (const char *fname, bool isdir, const char *progname), bool process_symlinks, const char *progname); /* @@ -54,7 +54,7 @@ fsync_pgdata(const char *pg_data, const char *progname) char pg_xlog[MAXPGPATH]; char pg_tblspc[MAXPGPATH]; - fputs(_("syncing data to disk ... "), stdout); + fputs(_("syncing data to disk ...\n"), stdout); fflush(stdout); snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data); @@ -123,7 +123,7 @@ fsync_pgdata(const char *pg_data, const char *progname) */ static void walkdir(const char *path, - void (*action) (const char *fname, bool isdir, const char *progname), + int (*action) (const char *fname, bool isdir, const char *progname), bool process_symlinks, const char *progname) { DIR *dir; @@ -231,7 +231,7 @@ pre_sync_fname(const char *fname, bool isdir, const char *progname) * directories on systems where that isn't allowed/required. Reports * other errors non-fatally. */ -void +int fsync_fname_ext(const char *fname, bool isdir, const char *progname) { int fd; @@ -259,10 +259,10 @@ fsync_fname_ext(const char *fname, bool isdir, const char *progname) if (fd < 0) { if (errno == EACCES || (isdir && errno == EISDIR)) - return; + return 0; fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), progname, fname, strerror(errno)); - return; + return -1; } returncode = fsync(fd); @@ -272,8 +272,78 @@ fsync_fname_ext(const char *fname, bool isdir, const char *progname) * those errors. Anything else needs to be reported. */ if (returncode != 0 && !(isdir && errno == EBADF)) + { fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"), progname, fname, strerror(errno)); + return -1; + } (void) close(fd); + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +int +fsync_parent_path(const char *fname, const char *progname) +{ + char parentpath[MAXPGPATH]; + + strlcpy(parentpath, fname, MAXPGPATH); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument is + * just a file name (see comments in path.c), so handle that as being the + * current directory. + */ + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + + if (fsync_fname_ext(parentpath, true, progname) != 0) + return -1; + + return 0; +} + +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * Wrapper around rename, similar to the backend version. Note that this + * version does not fsync the target file before the rename, as it's unlikely + * to be helpful for current and prospective users. + */ +int +durable_rename(const char *oldfile, const char *newfile, const char *progname) +{ + /* + * First fsync the old path, to ensure that it is properly persistent on + * disk. + */ + if (fsync_fname_ext(oldfile, false, progname) != 0) + return -1; + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) != 0) + { + fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"), + progname, oldfile, newfile, strerror(errno)); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false, progname) != 0) + return -1; + + if (fsync_parent_path(newfile, progname) != 0) + return -1; + + return 0; } diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 9ff7c40..4e5d8b3 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -15,8 +15,11 @@ #ifndef FILE_UTILS_H #define FILE_UTILS_H -extern void fsync_fname_ext(const char *fname, bool isdir, +extern int fsync_fname_ext(const char *fname, bool isdir, const char *progname); extern void fsync_pgdata(const char *pg_data, const char *progname); +extern int durable_rename(const char *oldfile, const char *newfile, + const char *progname); +extern int fsync_parent_path(const char *fname, const char *progname); #endif /* FILE_UTILS_H */ -- 2.9.3
From 96ea7b7bf5484940258c8caef38a1301c1f0fd28 Mon Sep 17 00:00:00 2001 From: Michael Paquier <michael@otacoo.com> Date: Sat, 3 Sep 2016 22:18:22 +0900 Subject: [PATCH 3/3] Add --nosync option to pg_basebackup This makes pg_basebackup, and is useful for testing. --- doc/src/sgml/ref/pg_basebackup.sgml | 15 +++++++++++++++ src/bin/pg_basebackup/pg_basebackup.c | 28 +++++++++++++++++++--------- src/bin/pg_basebackup/pg_receivexlog.c | 1 + src/bin/pg_basebackup/receivelog.c | 20 +++++++++++--------- src/bin/pg_basebackup/receivelog.h | 4 +++- 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index 03615da..50195fd 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -421,6 +421,21 @@ PostgreSQL documentation </varlistentry> <varlistentry> + <term><option>-N</option></term> + <term><option>--nosync</option></term> + <listitem> + <para> + By default, <command>pg_basebackup</command> will wait for all files + to be written safely to disk. This option causes + <command>pg_basebackup</command> to return without waiting, which is + faster, but means that a subsequent operating system crash can leave + the base backup corrupt. Generally, this option is useful for testing + but should not be used when creating a production installation. + </para> + </listitem> + </varlistentry> + + <varlistentry> <term><option>-v</option></term> <term><option>--verbose</option></term> <listitem> diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index a41b2e7..b3d891c 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -66,6 +66,7 @@ static bool includewal = false; static bool streamwal = false; static bool fastcheckpoint = false; static bool writerecoveryconf = false; +static bool do_sync = true; static int standby_message_timeout = 10 * 1000; /* 10 sec = default */ static pg_time_t last_progress_report = 0; static int32 maxrate = 0; /* no limit by default */ @@ -254,6 +255,7 @@ usage(void) printf(_(" -c, --checkpoint=fast|spread\n" " set fast or spread checkpointing\n")); printf(_(" -l, --label=LABEL set backup label\n")); + printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n")); printf(_(" -P, --progress show progress information\n")); printf(_(" -v, --verbose output verbose messages\n")); printf(_(" -V, --version output version information, then exit\n")); @@ -383,6 +385,7 @@ LogStreamerMain(logstreamer_param *param) stream.stream_stop = reached_end_position; stream.standby_message_timeout = standby_message_timeout; stream.synchronous = false; + stream.do_sync = do_sync; stream.mark_done = true; stream.basedir = param->xlogdir; stream.partial_suffix = NULL; @@ -1118,7 +1121,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) PQfreemem(copybuf); /* sync the resulting tar file, errors are not considered fatal */ - if (strcmp(basedir, "-") != 0) + if (do_sync && strcmp(basedir, "-") != 0) (void) fsync_fname_ext(filename, false, progname); } @@ -1886,14 +1889,17 @@ BaseBackup(void) * all the data of the base directory is synced, taking into account * all the tablespaces. Errors are not considered fatal. */ - if (format == 't') + if (do_sync) { - if (strcmp(basedir, "-") != 0) - (void) fsync_fname_ext(basedir, true, progname); - } - else - { - (void) fsync_pgdata(basedir, progname); + if (format == 't') + { + if (strcmp(basedir, "-") != 0) + (void) fsync_fname_ext(basedir, true, progname); + } + else + { + (void) fsync_pgdata(basedir, progname); + } } if (verbose) @@ -1928,6 +1934,7 @@ main(int argc, char **argv) {"status-interval", required_argument, NULL, 's'}, {"verbose", no_argument, NULL, 'v'}, {"progress", no_argument, NULL, 'P'}, + {"nosync", no_argument, NULL, 'N'}, {"xlogdir", required_argument, NULL, 1}, {NULL, 0, NULL, 0} }; @@ -1953,7 +1960,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "D:F:r:RT:xX:l:zZ:d:c:h:p:U:s:S:wWvP", + while ((c = getopt_long(argc, argv, "D:F:r:RT:xX:l:NzZ:d:c:h:p:U:s:S:wWvP", long_options, &option_index)) != -1) { switch (c) @@ -1977,6 +1984,9 @@ main(int argc, char **argv) case 'r': maxrate = parse_max_rate(optarg); break; + case 'N': + do_sync = false; + break; case 'R': writerecoveryconf = true; break; diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c index 7f7ee9d..a58a251 100644 --- a/src/bin/pg_basebackup/pg_receivexlog.c +++ b/src/bin/pg_basebackup/pg_receivexlog.c @@ -336,6 +336,7 @@ StreamLog(void) stream.stream_stop = stop_streaming; stream.standby_message_timeout = standby_message_timeout; stream.synchronous = synchronous; + stream.do_sync = true; stream.mark_done = false; stream.basedir = basedir; stream.partial_suffix = ".partial"; diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 8634a91..f7a4397 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -53,7 +53,7 @@ static bool ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline); static bool -mark_file_as_archived(const char *basedir, const char *fname) +mark_file_as_archived(const char *basedir, const char *fname, bool do_sync) { int fd; static char tmppath[MAXPGPATH]; @@ -71,10 +71,10 @@ mark_file_as_archived(const char *basedir, const char *fname) close(fd); - if (fsync_fname_ext(tmppath, false, progname) != 0) + if (do_sync && fsync_fname_ext(tmppath, false, progname) != 0) return false; - if (fsync_parent_path(tmppath, progname) != 0) + if (do_sync && fsync_parent_path(tmppath, progname) != 0) return false; return true; @@ -135,9 +135,9 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) * fsync, in case of a previous crash between padding and fsyncing the * file. */ - if (fsync_fname_ext(fn, false, progname) != 0) + if (stream->do_sync && fsync_fname_ext(fn, false, progname) != 0) return false; - if (fsync_parent_path(fn, progname) != 0) + if (stream->do_sync && fsync_parent_path(fn, progname) != 0) return false; return true; @@ -174,9 +174,9 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint) * using synchronous mode, where the file is modified and fsynced * in-place, without a directory fsync. */ - if (fsync_fname_ext(fn, false, progname) != 0) + if (stream->do_sync && fsync_fname_ext(fn, false, progname) != 0) return false; - if (fsync_parent_path(fn, progname) != 0) + if (stream->do_sync && fsync_parent_path(fn, progname) != 0) return false; if (lseek(f, SEEK_SET, 0) != 0) @@ -259,7 +259,8 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos) if (currpos == XLOG_SEG_SIZE && stream->mark_done) { /* writes error message if failed */ - if (!mark_file_as_archived(stream->basedir, current_walfile_name)) + if (!mark_file_as_archived(stream->basedir, current_walfile_name, + stream->do_sync)) return false; } @@ -379,7 +380,8 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content) if (stream->mark_done) { /* writes error message if failed */ - if (!mark_file_as_archived(stream->basedir, histfname)) + if (!mark_file_as_archived(stream->basedir, histfname, + stream->do_sync)) return false; } diff --git a/src/bin/pg_basebackup/receivelog.h b/src/bin/pg_basebackup/receivelog.h index 554ff8b..7a3bbc5 100644 --- a/src/bin/pg_basebackup/receivelog.h +++ b/src/bin/pg_basebackup/receivelog.h @@ -34,8 +34,10 @@ typedef struct StreamCtl * timeline */ int standby_message_timeout; /* Send status messages this * often */ - bool synchronous; /* Flush data on write */ + bool synchronous; /* Flush immediately WAL data on write */ bool mark_done; /* Mark segment as done in generated archive */ + bool do_sync; /* Flush to disk to ensure consistent state + * of data */ stream_stop_callback stream_stop; /* Stop streaming when returns true */ -- 2.9.3
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers