On Sat, Sep 3, 2016 at 10:22 PM, Michael Paquier
<michael.paqu...@gmail.com> wrote:
> On Sat, Sep 3, 2016 at 10:21 PM, Michael Paquier
> <michael.paqu...@gmail.com> wrote:
>> Oh, well. I have just implemented it on top of the two other patches
>> for pg_basebackup. For pg_receivexlog, I am wondering if it makes
>> sense to have it. That would be trivial to implement it, and I think
>> that we had better make the combination of --synchronous and --nosync
>> just leave with an error. Thoughts about having that for
>> pg_receivexlog?
>
> With patches that's actually better..

Meh, meh et meh.
-- 
Michael
From be6888046cc7dcfde33c22294e8d94a9369ff6b5 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Fri, 2 Sep 2016 15:19:11 +0900
Subject: [PATCH 1/3] Relocation fsync routines of initdb into src/common

Those are aimed at being used by other utilities, pg_basebackup mainly,
and at some other degree by pg_dump and pg_receivexlog.
---
 src/bin/initdb/initdb.c         | 266 +-------------------------------------
 src/common/Makefile             |   2 +-
 src/common/file_utils.c         | 279 ++++++++++++++++++++++++++++++++++++++++
 src/include/common/file_utils.h |  22 ++++
 src/tools/msvc/Mkvcbuild.pm     |   2 +-
 5 files changed, 310 insertions(+), 261 deletions(-)
 create mode 100644 src/common/file_utils.c
 create mode 100644 src/include/common/file_utils.h

diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 9407492..0b74ff9 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -61,6 +61,7 @@
 #endif
 
 #include "catalog/catalog.h"
+#include "common/file_utils.h"
 #include "common/restricted_token.h"
 #include "common/username.h"
 #include "mb/pg_wchar.h"
@@ -70,13 +71,6 @@
 #include "fe_utils/string_utils.h"
 
 
-/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
-#if defined(HAVE_SYNC_FILE_RANGE)
-#define PG_FLUSH_DATA_WORKS 1
-#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-#define PG_FLUSH_DATA_WORKS 1
-#endif
-
 /* Ideally this would be in a .h file, but it hardly seems worth the trouble */
 extern const char *select_default_timezone(const char *share_path);
 
@@ -237,13 +231,6 @@ static char **filter_lines_with_token(char **lines, const char *token);
 #endif
 static char **readfile(const char *path);
 static void writefile(char *path, char **lines);
-static void walkdir(const char *path,
-		void (*action) (const char *fname, bool isdir),
-		bool process_symlinks);
-#ifdef PG_FLUSH_DATA_WORKS
-static void pre_sync_fname(const char *fname, bool isdir);
-#endif
-static void fsync_fname_ext(const char *fname, bool isdir);
 static FILE *popen_check(const char *command, const char *mode);
 static void exit_nicely(void);
 static char *get_id(void);
@@ -270,7 +257,6 @@ static void load_plpgsql(FILE *cmdfd);
 static void vacuum_db(FILE *cmdfd);
 static void make_template0(FILE *cmdfd);
 static void make_postgres(FILE *cmdfd);
-static void fsync_pgdata(void);
 static void trapsig(int signum);
 static void check_ok(void);
 static char *escape_quotes(const char *src);
@@ -529,177 +515,6 @@ writefile(char *path, char **lines)
 }
 
 /*
- * walkdir: recursively walk a directory, applying the action to each
- * regular file and directory (including the named directory itself).
- *
- * If process_symlinks is true, the action and recursion are also applied
- * to regular files and directories that are pointed to by symlinks in the
- * given directory; otherwise symlinks are ignored.  Symlinks are always
- * ignored in subdirectories, ie we intentionally don't pass down the
- * process_symlinks flag to recursive calls.
- *
- * Errors are reported but not considered fatal.
- *
- * See also walkdir in fd.c, which is a backend version of this logic.
- */
-static void
-walkdir(const char *path,
-		void (*action) (const char *fname, bool isdir),
-		bool process_symlinks)
-{
-	DIR		   *dir;
-	struct dirent *de;
-
-	dir = opendir(path);
-	if (dir == NULL)
-	{
-		fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
-				progname, path, strerror(errno));
-		return;
-	}
-
-	while (errno = 0, (de = readdir(dir)) != NULL)
-	{
-		char		subpath[MAXPGPATH];
-		struct stat fst;
-		int			sret;
-
-		if (strcmp(de->d_name, ".") == 0 ||
-			strcmp(de->d_name, "..") == 0)
-			continue;
-
-		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
-
-		if (process_symlinks)
-			sret = stat(subpath, &fst);
-		else
-			sret = lstat(subpath, &fst);
-
-		if (sret < 0)
-		{
-			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
-					progname, subpath, strerror(errno));
-			continue;
-		}
-
-		if (S_ISREG(fst.st_mode))
-			(*action) (subpath, false);
-		else if (S_ISDIR(fst.st_mode))
-			walkdir(subpath, action, false);
-	}
-
-	if (errno)
-		fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"),
-				progname, path, strerror(errno));
-
-	(void) closedir(dir);
-
-	/*
-	 * It's important to fsync the destination directory itself as individual
-	 * file fsyncs don't guarantee that the directory entry for the file is
-	 * synced.  Recent versions of ext4 have made the window much wider but
-	 * it's been an issue for ext3 and other filesystems in the past.
-	 */
-	(*action) (path, true);
-}
-
-/*
- * Hint to the OS that it should get ready to fsync() this file.
- *
- * Ignores errors trying to open unreadable files, and reports other errors
- * non-fatally.
- */
-#ifdef PG_FLUSH_DATA_WORKS
-
-static void
-pre_sync_fname(const char *fname, bool isdir)
-{
-	int			fd;
-
-	fd = open(fname, O_RDONLY | PG_BINARY);
-
-	if (fd < 0)
-	{
-		if (errno == EACCES || (isdir && errno == EISDIR))
-			return;
-		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
-				progname, fname, strerror(errno));
-		return;
-	}
-
-	/*
-	 * We do what pg_flush_data() would do in the backend: prefer to use
-	 * sync_file_range, but fall back to posix_fadvise.  We ignore errors
-	 * because this is only a hint.
-	 */
-#if defined(HAVE_SYNC_FILE_RANGE)
-	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
-#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
-#else
-#error PG_FLUSH_DATA_WORKS should not have been defined
-#endif
-
-	(void) close(fd);
-}
-
-#endif   /* PG_FLUSH_DATA_WORKS */
-
-/*
- * fsync_fname_ext -- Try to fsync a file or directory
- *
- * Ignores errors trying to open unreadable files, or trying to fsync
- * directories on systems where that isn't allowed/required.  Reports
- * other errors non-fatally.
- */
-static void
-fsync_fname_ext(const char *fname, bool isdir)
-{
-	int			fd;
-	int			flags;
-	int			returncode;
-
-	/*
-	 * Some OSs require directories to be opened read-only whereas other
-	 * systems don't allow us to fsync files opened read-only; so we need both
-	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
-	 * not writable by our userid, but we assume that's OK.
-	 */
-	flags = PG_BINARY;
-	if (!isdir)
-		flags |= O_RDWR;
-	else
-		flags |= O_RDONLY;
-
-	/*
-	 * Open the file, silently ignoring errors about unreadable files (or
-	 * unsupported operations, e.g. opening a directory under Windows), and
-	 * logging others.
-	 */
-	fd = open(fname, flags);
-	if (fd < 0)
-	{
-		if (errno == EACCES || (isdir && errno == EISDIR))
-			return;
-		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
-				progname, fname, strerror(errno));
-		return;
-	}
-
-	returncode = fsync(fd);
-
-	/*
-	 * Some OSes don't allow us to fsync directories at all, so we can ignore
-	 * those errors. Anything else needs to be reported.
-	 */
-	if (returncode != 0 && !(isdir && errno == EBADF))
-		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
-				progname, fname, strerror(errno));
-
-	(void) close(fd);
-}
-
-/*
  * Open a subcommand with suitable error messaging
  */
 static FILE *
@@ -2276,77 +2091,6 @@ make_postgres(FILE *cmdfd)
 		PG_CMD_PUTS(*line);
 }
 
-/*
- * Issue fsync recursively on PGDATA and all its contents.
- *
- * We fsync regular files and directories wherever they are, but we
- * follow symlinks only for pg_xlog and immediately under pg_tblspc.
- * Other symlinks are presumed to point at files we're not responsible
- * for fsyncing, and might not have privileges to write at all.
- *
- * Errors are reported but not considered fatal.
- */
-static void
-fsync_pgdata(void)
-{
-	bool		xlog_is_symlink;
-	char		pg_xlog[MAXPGPATH];
-	char		pg_tblspc[MAXPGPATH];
-
-	fputs(_("syncing data to disk ... "), stdout);
-	fflush(stdout);
-
-	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data);
-	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
-
-	/*
-	 * If pg_xlog is a symlink, we'll need to recurse into it separately,
-	 * because the first walkdir below will ignore it.
-	 */
-	xlog_is_symlink = false;
-
-#ifndef WIN32
-	{
-		struct stat st;
-
-		if (lstat(pg_xlog, &st) < 0)
-			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
-					progname, pg_xlog, strerror(errno));
-		else if (S_ISLNK(st.st_mode))
-			xlog_is_symlink = true;
-	}
-#else
-	if (pgwin32_is_junction(pg_xlog))
-		xlog_is_symlink = true;
-#endif
-
-	/*
-	 * If possible, hint to the kernel that we're soon going to fsync the data
-	 * directory and its contents.
-	 */
-#ifdef PG_FLUSH_DATA_WORKS
-	walkdir(pg_data, pre_sync_fname, false);
-	if (xlog_is_symlink)
-		walkdir(pg_xlog, pre_sync_fname, false);
-	walkdir(pg_tblspc, pre_sync_fname, true);
-#endif
-
-	/*
-	 * Now we do the fsync()s in the same order.
-	 *
-	 * The main call ignores symlinks, so in addition to specially processing
-	 * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
-	 * process_symlinks = true.  Note that if there are any plain directories
-	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
-	 * so we don't worry about optimizing it.
-	 */
-	walkdir(pg_data, fsync_fname_ext, false);
-	if (xlog_is_symlink)
-		walkdir(pg_xlog, fsync_fname_ext, false);
-	walkdir(pg_tblspc, fsync_fname_ext, true);
-
-	check_ok();
-}
 
 
 /*
@@ -3512,7 +3256,8 @@ main(int argc, char *argv[])
 			exit_nicely();
 		}
 
-		fsync_pgdata();
+		fsync_pgdata(pg_data, progname);
+		check_ok();
 		return 0;
 	}
 
@@ -3574,7 +3319,10 @@ main(int argc, char *argv[])
 	initialize_data_directory();
 
 	if (do_sync)
-		fsync_pgdata();
+	{
+		fsync_pgdata(pg_data, progname);
+		check_ok();
+	}
 	else
 		printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n"));
 
diff --git a/src/common/Makefile b/src/common/Makefile
index a5fa649..03dfaa1 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -44,7 +44,7 @@ OBJS_COMMON = config_info.o controldata_utils.o exec.o ip.o keywords.o \
 	md5.o pg_lzcompress.o pgfnames.o psprintf.o relpath.o rmtree.o \
 	string.o username.o wait_error.o
 
-OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o restricted_token.o
+OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
 
 OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
 
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
new file mode 100644
index 0000000..8b97dcb
--- /dev/null
+++ b/src/common/file_utils.c
@@ -0,0 +1,279 @@
+/*-------------------------------------------------------------------------
+ *
+ * File-processing utility routines.
+ *
+ * Assorted utility functions to work on files.
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/common/file_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "common/file_utils.h"
+
+
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(HAVE_SYNC_FILE_RANGE)
+#define PG_FLUSH_DATA_WORKS 1
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
+
+#ifdef PG_FLUSH_DATA_WORKS
+static void pre_sync_fname(const char *fname, bool isdir,
+						   const char *progname);
+#endif
+static void walkdir(const char *path,
+	void (*action) (const char *fname, bool isdir, const char *progname),
+	bool process_symlinks, const char *progname);
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_xlog and immediately under pg_tblspc.
+ * Other symlinks are presumed to point at files we're not responsible
+ * for fsyncing, and might not have privileges to write at all.
+ *
+ * Errors are reported but not considered fatal.
+ */
+void
+fsync_pgdata(const char *pg_data, const char *progname)
+{
+	bool		xlog_is_symlink;
+	char		pg_xlog[MAXPGPATH];
+	char		pg_tblspc[MAXPGPATH];
+
+	fputs(_("syncing data to disk ... "), stdout);
+	fflush(stdout);
+
+	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data);
+	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
+
+	/*
+	 * If pg_xlog is a symlink, we'll need to recurse into it separately,
+	 * because the first walkdir below will ignore it.
+	 */
+	xlog_is_symlink = false;
+
+#ifndef WIN32
+	{
+		struct stat st;
+
+		if (lstat(pg_xlog, &st) < 0)
+			fprintf(stderr, _("could not stat file \"%s\": %s\n"),
+					pg_xlog, strerror(errno));
+		else if (S_ISLNK(st.st_mode))
+			xlog_is_symlink = true;
+	}
+#else
+	if (pgwin32_is_junction(pg_xlog))
+		xlog_is_symlink = true;
+#endif
+
+	/*
+	 * If possible, hint to the kernel that we're soon going to fsync the data
+	 * directory and its contents.
+	 */
+#ifdef PG_FLUSH_DATA_WORKS
+	walkdir(pg_data, pre_sync_fname, false, progname);
+	if (xlog_is_symlink)
+		walkdir(pg_xlog, pre_sync_fname, false, progname);
+	walkdir(pg_tblspc, pre_sync_fname, true, progname);
+#endif
+
+	/*
+	 * Now we do the fsync()s in the same order.
+	 *
+	 * The main call ignores symlinks, so in addition to specially processing
+	 * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with
+	 * process_symlinks = true.  Note that if there are any plain directories
+	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
+	 * so we don't worry about optimizing it.
+	 */
+	walkdir(pg_data, fsync_fname_ext, false, progname);
+	if (xlog_is_symlink)
+		walkdir(pg_xlog, fsync_fname_ext, false, progname);
+	walkdir(pg_tblspc, fsync_fname_ext, true, progname);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself).
+ *
+ * If process_symlinks is true, the action and recursion are also applied
+ * to regular files and directories that are pointed to by symlinks in the
+ * given directory; otherwise symlinks are ignored.  Symlinks are always
+ * ignored in subdirectories, ie we intentionally don't pass down the
+ * process_symlinks flag to recursive calls.
+ *
+ * Errors are reported but not considered fatal.
+ *
+ * See also walkdir in fd.c, which is a backend version of this logic.
+ */
+static void
+walkdir(const char *path,
+		void (*action) (const char *fname, bool isdir, const char *progname),
+		bool process_symlinks, const char *progname)
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = opendir(path);
+	if (dir == NULL)
+	{
+		fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
+				progname, path, strerror(errno));
+		return;
+	}
+
+	while (errno = 0, (de = readdir(dir)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
+		int			sret;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		if (process_symlinks)
+			sret = stat(subpath, &fst);
+		else
+			sret = lstat(subpath, &fst);
+
+		if (sret < 0)
+		{
+			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
+					progname, subpath, strerror(errno));
+			continue;
+		}
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (subpath, false, progname);
+		else if (S_ISDIR(fst.st_mode))
+			walkdir(subpath, action, false, progname);
+	}
+
+	if (errno)
+		fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"),
+				progname, path, strerror(errno));
+
+	(void) closedir(dir);
+
+	/*
+	 * It's important to fsync the destination directory itself as individual
+	 * file fsyncs don't guarantee that the directory entry for the file is
+	 * synced.  Recent versions of ext4 have made the window much wider but
+	 * it's been an issue for ext3 and other filesystems in the past.
+	 */
+	(*action) (path, true, progname);
+}
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Ignores errors trying to open unreadable files, and reports other errors
+ * non-fatally.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+
+static void
+pre_sync_fname(const char *fname, bool isdir, const char *progname)
+{
+	int			fd;
+
+	fd = open(fname, O_RDONLY | PG_BINARY);
+
+	if (fd < 0)
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
+		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
+				progname, fname, strerror(errno));
+		return;
+	}
+
+	/*
+	 * We do what pg_flush_data() would do in the backend: prefer to use
+	 * sync_file_range, but fall back to posix_fadvise.  We ignore errors
+	 * because this is only a hint.
+	 */
+#if defined(HAVE_SYNC_FILE_RANGE)
+	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#else
+#error PG_FLUSH_DATA_WORKS should not have been defined
+#endif
+
+	(void) close(fd);
+}
+
+#endif   /* PG_FLUSH_DATA_WORKS */
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * Ignores errors trying to open unreadable files, or trying to fsync
+ * directories on systems where that isn't allowed/required.  Reports
+ * other errors non-fatally.
+ */
+void
+fsync_fname_ext(const char *fname, bool isdir, const char *progname)
+{
+	int			fd;
+	int			flags;
+	int			returncode;
+
+	/*
+	 * Some OSs require directories to be opened read-only whereas other
+	 * systems don't allow us to fsync files opened read-only; so we need both
+	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
+	 * not writable by our userid, but we assume that's OK.
+	 */
+	flags = PG_BINARY;
+	if (!isdir)
+		flags |= O_RDWR;
+	else
+		flags |= O_RDONLY;
+
+	/*
+	 * Open the file, silently ignoring errors about unreadable files (or
+	 * unsupported operations, e.g. opening a directory under Windows), and
+	 * logging others.
+	 */
+	fd = open(fname, flags);
+	if (fd < 0)
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
+		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
+				progname, fname, strerror(errno));
+		return;
+	}
+
+	returncode = fsync(fd);
+
+	/*
+	 * Some OSes don't allow us to fsync directories at all, so we can ignore
+	 * those errors. Anything else needs to be reported.
+	 */
+	if (returncode != 0 && !(isdir && errno == EBADF))
+		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
+				progname, fname, strerror(errno));
+
+	(void) close(fd);
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
new file mode 100644
index 0000000..9ff7c40
--- /dev/null
+++ b/src/include/common/file_utils.h
@@ -0,0 +1,22 @@
+/*-------------------------------------------------------------------------
+ *
+ * File-processing utility routines for frontend code
+ *
+ * Assorted utility functions to work on files.
+ *
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/file_utils.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FILE_UTILS_H
+#define FILE_UTILS_H
+
+extern void fsync_fname_ext(const char *fname, bool isdir,
+							const char *progname);
+extern void fsync_pgdata(const char *pg_data, const char *progname);
+
+#endif   /* FILE_UTILS_H */
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index b3ed1f5..f9edc22 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -115,7 +115,7 @@ sub mkvcbuild
 	  string.c username.c wait_error.c);
 
 	our @pgcommonfrontendfiles = (
-		@pgcommonallfiles, qw(fe_memutils.c
+		@pgcommonallfiles, qw(file_utils.c fe_memutils.c
 		  restricted_token.c));
 
 	our @pgcommonbkndfiles = @pgcommonallfiles;
-- 
2.9.3

From 424dce3f8af832bd164456e33126eb09326e9bfa Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Fri, 2 Sep 2016 15:36:08 +0900
Subject: [PATCH 2/3] Issue fsync more carefully in pg_receivexlog and
 pg_basebackup [-X] stream.

Several places weren't careful about fsyncing in the way. See 1d4a0ab1
and 606e0f98 for details about required fsyns.

This adds a couple of routines in fe_utils which have an equivalent in the
backend:
- durable_rename
- fsync_parent_path
---
 src/bin/pg_basebackup/pg_basebackup.c | 27 ++++++++++++
 src/bin/pg_basebackup/receivelog.c    | 56 ++++++++++++++----------
 src/common/file_utils.c               | 82 ++++++++++++++++++++++++++++++++---
 src/include/common/file_utils.h       |  5 ++-
 4 files changed, 141 insertions(+), 29 deletions(-)

diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 351a420..a41b2e7 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -25,6 +25,7 @@
 #include <zlib.h>
 #endif
 
+#include "common/file_utils.h"
 #include "common/string.h"
 #include "fe_utils/string_utils.h"
 #include "getopt_long.h"
@@ -1115,6 +1116,10 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
 
 	if (copybuf != NULL)
 		PQfreemem(copybuf);
+
+	/* sync the resulting tar file, errors are not considered fatal */
+	if (strcmp(basedir, "-") != 0)
+		(void) fsync_fname_ext(filename, false, progname);
 }
 
 
@@ -1391,6 +1396,11 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum)
 
 	if (basetablespace && writerecoveryconf)
 		WriteRecoveryConf();
+
+	/*
+	 * No data is synced here, everything is done for all tablespaces at the
+	 * end.
+	 */
 }
 
 /*
@@ -1869,6 +1879,23 @@ BaseBackup(void)
 	PQclear(res);
 	PQfinish(conn);
 
+	/*
+	 * Make data persistent on disk once backup is completed. For tar
+	 * format once syncing the parent directory is fine, each tar file
+	 * created per tablespace has been already synced. In plain format,
+	 * all the data of the base directory is synced, taking into account
+	 * all the tablespaces. Errors are not considered fatal.
+	 */
+	if (format == 't')
+	{
+		if (strcmp(basedir, "-") != 0)
+			(void) fsync_fname_ext(basedir, true, progname);
+	}
+	else
+	{
+		(void) fsync_pgdata(basedir, progname);
+	}
+
 	if (verbose)
 		fprintf(stderr, "%s: base backup completed\n", progname);
 }
diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c
index 062730b..8634a91 100644
--- a/src/bin/pg_basebackup/receivelog.c
+++ b/src/bin/pg_basebackup/receivelog.c
@@ -23,6 +23,7 @@
 
 #include "libpq-fe.h"
 #include "access/xlog_internal.h"
+#include "common/file_utils.h"
 
 
 /* fd and filename for currently open WAL file */
@@ -68,17 +69,13 @@ mark_file_as_archived(const char *basedir, const char *fname)
 		return false;
 	}
 
-	if (fsync(fd) != 0)
-	{
-		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
-				progname, tmppath, strerror(errno));
-
-		close(fd);
+	close(fd);
 
+	if (fsync_fname_ext(tmppath, false, progname) != 0)
 		return false;
-	}
 
-	close(fd);
+	if (fsync_parent_path(tmppath, progname) != 0)
+		return false;
 
 	return true;
 }
@@ -116,6 +113,10 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	/*
 	 * Verify that the file is either empty (just created), or a complete
 	 * XLogSegSize segment. Anything in between indicates a corrupt file.
+	 *
+	 * XXX: This means that we might not restart if a crash occurs before the
+	 * fsync below. We probably should create the file in a temporary path
+	 * like the backend does...
 	 */
 	if (fstat(f, &statbuf) != 0)
 	{
@@ -129,6 +130,16 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	{
 		/* File is open and ready to use */
 		walfile = f;
+
+		/*
+		 * fsync, in case of a previous crash between padding and fsyncing the
+		 * file.
+		 */
+		if (fsync_fname_ext(fn, false, progname) != 0)
+			return false;
+		if (fsync_parent_path(fn, progname) != 0)
+			return false;
+
 		return true;
 	}
 	if (statbuf.st_size != 0)
@@ -157,6 +168,17 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	}
 	free(zerobuf);
 
+	/*
+	 * fsync WAL file and containing directory, to ensure the file is
+	 * persistently created and zeroed. That's particularly important when
+	 * using synchronous mode, where the file is modified and fsynced
+	 * in-place, without a directory fsync.
+	 */
+	if (fsync_fname_ext(fn, false, progname) != 0)
+		return false;
+	if (fsync_parent_path(fn, progname) != 0)
+		return false;
+
 	if (lseek(f, SEEK_SET, 0) != 0)
 	{
 		fprintf(stderr,
@@ -217,10 +239,9 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos)
 
 		snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix);
 		snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name);
-		if (rename(oldfn, newfn) != 0)
+		if (durable_rename(oldfn, newfn, progname) != 0)
 		{
-			fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"),
-					progname, current_walfile_name, strerror(errno));
+			/* durable_rename produced a log entry */
 			return false;
 		}
 	}
@@ -338,14 +359,6 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
 		return false;
 	}
 
-	if (fsync(fd) != 0)
-	{
-		close(fd);
-		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
-				progname, tmppath, strerror(errno));
-		return false;
-	}
-
 	if (close(fd) != 0)
 	{
 		fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
@@ -356,10 +369,9 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
 	/*
 	 * Now move the completed history file into place with its final name.
 	 */
-	if (rename(tmppath, path) < 0)
+	if (durable_rename(tmppath, path, progname) < 0)
 	{
-		fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
-				progname, tmppath, path, strerror(errno));
+		/* durable_rename produced a log entry */
 		return false;
 	}
 
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 8b97dcb..3a6e7db 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -34,7 +34,7 @@ static void pre_sync_fname(const char *fname, bool isdir,
 						   const char *progname);
 #endif
 static void walkdir(const char *path,
-	void (*action) (const char *fname, bool isdir, const char *progname),
+	int (*action) (const char *fname, bool isdir, const char *progname),
 	bool process_symlinks, const char *progname);
 
 /*
@@ -54,7 +54,7 @@ fsync_pgdata(const char *pg_data, const char *progname)
 	char		pg_xlog[MAXPGPATH];
 	char		pg_tblspc[MAXPGPATH];
 
-	fputs(_("syncing data to disk ... "), stdout);
+	fputs(_("syncing data to disk ...\n"), stdout);
 	fflush(stdout);
 
 	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data);
@@ -123,7 +123,7 @@ fsync_pgdata(const char *pg_data, const char *progname)
  */
 static void
 walkdir(const char *path,
-		void (*action) (const char *fname, bool isdir, const char *progname),
+		int (*action) (const char *fname, bool isdir, const char *progname),
 		bool process_symlinks, const char *progname)
 {
 	DIR		   *dir;
@@ -231,7 +231,7 @@ pre_sync_fname(const char *fname, bool isdir, const char *progname)
  * directories on systems where that isn't allowed/required.  Reports
  * other errors non-fatally.
  */
-void
+int
 fsync_fname_ext(const char *fname, bool isdir, const char *progname)
 {
 	int			fd;
@@ -259,10 +259,10 @@ fsync_fname_ext(const char *fname, bool isdir, const char *progname)
 	if (fd < 0)
 	{
 		if (errno == EACCES || (isdir && errno == EISDIR))
-			return;
+			return 0;
 		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
 				progname, fname, strerror(errno));
-		return;
+		return -1;
 	}
 
 	returncode = fsync(fd);
@@ -272,8 +272,78 @@ fsync_fname_ext(const char *fname, bool isdir, const char *progname)
 	 * those errors. Anything else needs to be reported.
 	 */
 	if (returncode != 0 && !(isdir && errno == EBADF))
+	{
 		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
 				progname, fname, strerror(errno));
+		return -1;
+	}
 
 	(void) close(fd);
+	return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+int
+fsync_parent_path(const char *fname, const char *progname)
+{
+	char		parentpath[MAXPGPATH];
+
+	strlcpy(parentpath, fname, MAXPGPATH);
+	get_parent_directory(parentpath);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument is
+	 * just a file name (see comments in path.c), so handle that as being the
+	 * current directory.
+	 */
+	if (strlen(parentpath) == 0)
+		strlcpy(parentpath, ".", MAXPGPATH);
+
+	if (fsync_fname_ext(parentpath, true, progname) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * Wrapper around rename, similar to the backend version.  Note that this
+ * version does not fsync the target file before the rename, as it's unlikely
+ * to be helpful for current and prospective users.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, const char *progname)
+{
+	/*
+	 * First fsync the old path, to ensure that it is properly persistent on
+	 * disk.
+	 */
+	if (fsync_fname_ext(oldfile, false, progname) != 0)
+		return -1;
+
+	/* Time to do the real deal... */
+	if (rename(oldfile, newfile) != 0)
+	{
+		fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
+				progname, oldfile, newfile, strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * To guarantee renaming the file is persistent, fsync the file with its
+	 * new name, and its containing directory.
+	 */
+	if (fsync_fname_ext(newfile, false, progname) != 0)
+		return -1;
+
+	if (fsync_parent_path(newfile, progname) != 0)
+		return -1;
+
+	return 0;
 }
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 9ff7c40..4e5d8b3 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -15,8 +15,11 @@
 #ifndef FILE_UTILS_H
 #define FILE_UTILS_H
 
-extern void fsync_fname_ext(const char *fname, bool isdir,
+extern int fsync_fname_ext(const char *fname, bool isdir,
 							const char *progname);
 extern void fsync_pgdata(const char *pg_data, const char *progname);
+extern int durable_rename(const char *oldfile, const char *newfile,
+						  const char *progname);
+extern int fsync_parent_path(const char *fname, const char *progname);
 
 #endif   /* FILE_UTILS_H */
-- 
2.9.3

From 96ea7b7bf5484940258c8caef38a1301c1f0fd28 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@otacoo.com>
Date: Sat, 3 Sep 2016 22:18:22 +0900
Subject: [PATCH 3/3] Add --nosync option to pg_basebackup

This makes pg_basebackup, and is useful for testing.
---
 doc/src/sgml/ref/pg_basebackup.sgml    | 15 +++++++++++++++
 src/bin/pg_basebackup/pg_basebackup.c  | 28 +++++++++++++++++++---------
 src/bin/pg_basebackup/pg_receivexlog.c |  1 +
 src/bin/pg_basebackup/receivelog.c     | 20 +++++++++++---------
 src/bin/pg_basebackup/receivelog.h     |  4 +++-
 5 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index 03615da..50195fd 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -421,6 +421,21 @@ PostgreSQL documentation
      </varlistentry>
 
      <varlistentry>
+      <term><option>-N</option></term>
+      <term><option>--nosync</option></term>
+      <listitem>
+       <para>
+        By default, <command>pg_basebackup</command> will wait for all files
+        to be written safely to disk.  This option causes
+        <command>pg_basebackup</command> to return without waiting, which is
+        faster, but means that a subsequent operating system crash can leave
+        the base backup corrupt.  Generally, this option is useful for testing
+        but should not be used when creating a production installation.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-v</option></term>
       <term><option>--verbose</option></term>
       <listitem>
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index a41b2e7..b3d891c 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -66,6 +66,7 @@ static bool includewal = false;
 static bool streamwal = false;
 static bool fastcheckpoint = false;
 static bool writerecoveryconf = false;
+static bool do_sync = true;
 static int	standby_message_timeout = 10 * 1000;		/* 10 sec = default */
 static pg_time_t last_progress_report = 0;
 static int32 maxrate = 0;		/* no limit by default */
@@ -254,6 +255,7 @@ usage(void)
 	printf(_("  -c, --checkpoint=fast|spread\n"
 			 "                         set fast or spread checkpointing\n"));
 	printf(_("  -l, --label=LABEL      set backup label\n"));
+	printf(_("  -N, --nosync           do not wait for changes to be written safely to disk\n"));
 	printf(_("  -P, --progress         show progress information\n"));
 	printf(_("  -v, --verbose          output verbose messages\n"));
 	printf(_("  -V, --version          output version information, then exit\n"));
@@ -383,6 +385,7 @@ LogStreamerMain(logstreamer_param *param)
 	stream.stream_stop = reached_end_position;
 	stream.standby_message_timeout = standby_message_timeout;
 	stream.synchronous = false;
+	stream.do_sync = do_sync;
 	stream.mark_done = true;
 	stream.basedir = param->xlogdir;
 	stream.partial_suffix = NULL;
@@ -1118,7 +1121,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
 		PQfreemem(copybuf);
 
 	/* sync the resulting tar file, errors are not considered fatal */
-	if (strcmp(basedir, "-") != 0)
+	if (do_sync && strcmp(basedir, "-") != 0)
 		(void) fsync_fname_ext(filename, false, progname);
 }
 
@@ -1886,14 +1889,17 @@ BaseBackup(void)
 	 * all the data of the base directory is synced, taking into account
 	 * all the tablespaces. Errors are not considered fatal.
 	 */
-	if (format == 't')
+	if (do_sync)
 	{
-		if (strcmp(basedir, "-") != 0)
-			(void) fsync_fname_ext(basedir, true, progname);
-	}
-	else
-	{
-		(void) fsync_pgdata(basedir, progname);
+		if (format == 't')
+		{
+			if (strcmp(basedir, "-") != 0)
+				(void) fsync_fname_ext(basedir, true, progname);
+		}
+		else
+		{
+			(void) fsync_pgdata(basedir, progname);
+		}
 	}
 
 	if (verbose)
@@ -1928,6 +1934,7 @@ main(int argc, char **argv)
 		{"status-interval", required_argument, NULL, 's'},
 		{"verbose", no_argument, NULL, 'v'},
 		{"progress", no_argument, NULL, 'P'},
+		{"nosync", no_argument, NULL, 'N'},
 		{"xlogdir", required_argument, NULL, 1},
 		{NULL, 0, NULL, 0}
 	};
@@ -1953,7 +1960,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	while ((c = getopt_long(argc, argv, "D:F:r:RT:xX:l:zZ:d:c:h:p:U:s:S:wWvP",
+	while ((c = getopt_long(argc, argv, "D:F:r:RT:xX:l:NzZ:d:c:h:p:U:s:S:wWvP",
 							long_options, &option_index)) != -1)
 	{
 		switch (c)
@@ -1977,6 +1984,9 @@ main(int argc, char **argv)
 			case 'r':
 				maxrate = parse_max_rate(optarg);
 				break;
+			case 'N':
+				do_sync = false;
+				break;
 			case 'R':
 				writerecoveryconf = true;
 				break;
diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c
index 7f7ee9d..a58a251 100644
--- a/src/bin/pg_basebackup/pg_receivexlog.c
+++ b/src/bin/pg_basebackup/pg_receivexlog.c
@@ -336,6 +336,7 @@ StreamLog(void)
 	stream.stream_stop = stop_streaming;
 	stream.standby_message_timeout = standby_message_timeout;
 	stream.synchronous = synchronous;
+	stream.do_sync = true;
 	stream.mark_done = false;
 	stream.basedir = basedir;
 	stream.partial_suffix = ".partial";
diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c
index 8634a91..f7a4397 100644
--- a/src/bin/pg_basebackup/receivelog.c
+++ b/src/bin/pg_basebackup/receivelog.c
@@ -53,7 +53,7 @@ static bool ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos,
 						 uint32 *timeline);
 
 static bool
-mark_file_as_archived(const char *basedir, const char *fname)
+mark_file_as_archived(const char *basedir, const char *fname, bool do_sync)
 {
 	int			fd;
 	static char tmppath[MAXPGPATH];
@@ -71,10 +71,10 @@ mark_file_as_archived(const char *basedir, const char *fname)
 
 	close(fd);
 
-	if (fsync_fname_ext(tmppath, false, progname) != 0)
+	if (do_sync && fsync_fname_ext(tmppath, false, progname) != 0)
 		return false;
 
-	if (fsync_parent_path(tmppath, progname) != 0)
+	if (do_sync && fsync_parent_path(tmppath, progname) != 0)
 		return false;
 
 	return true;
@@ -135,9 +135,9 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 		 * fsync, in case of a previous crash between padding and fsyncing the
 		 * file.
 		 */
-		if (fsync_fname_ext(fn, false, progname) != 0)
+		if (stream->do_sync && fsync_fname_ext(fn, false, progname) != 0)
 			return false;
-		if (fsync_parent_path(fn, progname) != 0)
+		if (stream->do_sync && fsync_parent_path(fn, progname) != 0)
 			return false;
 
 		return true;
@@ -174,9 +174,9 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	 * using synchronous mode, where the file is modified and fsynced
 	 * in-place, without a directory fsync.
 	 */
-	if (fsync_fname_ext(fn, false, progname) != 0)
+	if (stream->do_sync && fsync_fname_ext(fn, false, progname) != 0)
 		return false;
-	if (fsync_parent_path(fn, progname) != 0)
+	if (stream->do_sync && fsync_parent_path(fn, progname) != 0)
 		return false;
 
 	if (lseek(f, SEEK_SET, 0) != 0)
@@ -259,7 +259,8 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos)
 	if (currpos == XLOG_SEG_SIZE && stream->mark_done)
 	{
 		/* writes error message if failed */
-		if (!mark_file_as_archived(stream->basedir, current_walfile_name))
+		if (!mark_file_as_archived(stream->basedir, current_walfile_name,
+								   stream->do_sync))
 			return false;
 	}
 
@@ -379,7 +380,8 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
 	if (stream->mark_done)
 	{
 		/* writes error message if failed */
-		if (!mark_file_as_archived(stream->basedir, histfname))
+		if (!mark_file_as_archived(stream->basedir, histfname,
+								   stream->do_sync))
 			return false;
 	}
 
diff --git a/src/bin/pg_basebackup/receivelog.h b/src/bin/pg_basebackup/receivelog.h
index 554ff8b..7a3bbc5 100644
--- a/src/bin/pg_basebackup/receivelog.h
+++ b/src/bin/pg_basebackup/receivelog.h
@@ -34,8 +34,10 @@ typedef struct StreamCtl
 								 * timeline */
 	int			standby_message_timeout;		/* Send status messages this
 												 * often */
-	bool		synchronous;	/* Flush data on write */
+	bool		synchronous;	/* Flush immediately WAL data on write */
 	bool		mark_done;		/* Mark segment as done in generated archive */
+	bool		do_sync;		/* Flush to disk to ensure consistent state
+								 * of data */
 
 	stream_stop_callback stream_stop;	/* Stop streaming when returns true */
 
-- 
2.9.3

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to