I have made a major revision of this patch. I have removed all the changes to CREATE DATABASE. That was too contentious and we got lost in unrelated details there. The real benefit is for pg_upgrade.
Another point was that for pg_upgrade use, a user would like to know beforehand whether reflinking would be used, which was not possible with the copy_file_range() API. So here I have switched to using the ioctl() call directly. So the new interface is that pg_upgrade has a new option --reflink={always,auto,never}. (This option name is adapted from GNU cp.) From the documentation: <para> The setting <literal>always</literal> requires the use of reflinks. If they are not supported, the <application>pg_upgrade</application> run will abort. Use this in production to limit the upgrade run time. The setting <literal>auto</literal> uses reflinks when available, otherwise it falls back to a normal copy. This is the default. The setting <literal>never</literal> prevents use of reflinks and always uses a normal copy. This can be useful to ensure that the upgraded cluster has its disk space fully allocated and not shared with the old cluster. </para> Also, pg_upgrade --check will check whether the selected option would work. -- Peter Eisentraut http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From c39e40640e70e8fc4b90e762b985201a1ce9f912 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut <pete...@gmx.net> Date: Tue, 5 Jun 2018 17:24:53 -0400 Subject: [PATCH v3] pg_upgrade: Allow use of file cloning For file copying in pg_upgrade, allow using special file cloning calls if available. This makes the copying faster and more space efficient. This achieves speed similar to --link mode without the associated drawbacks. Add an option --reflink to select whether file cloning is turned on, off, or automatic. Automatic is the default. On Linux, file cloning is supported on Btrfs and XFS (if formatted with reflink support). On macOS, file cloning is supported on APFS. --- configure | 2 +- configure.in | 2 +- doc/src/sgml/ref/pgupgrade.sgml | 33 +++++++++ src/bin/pg_upgrade/check.c | 2 + src/bin/pg_upgrade/file.c | 123 +++++++++++++++++++++++++++++++ src/bin/pg_upgrade/option.c | 14 ++++ src/bin/pg_upgrade/pg_upgrade.h | 15 ++++ src/bin/pg_upgrade/relfilenode.c | 31 +++++++- src/include/pg_config.h.in | 3 + 9 files changed, 220 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 3d219c802b..a0eebf7462 100755 --- a/configure +++ b/configure @@ -14827,7 +14827,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 862d8b128d..73632bee91 100644 --- a/configure.in +++ 
b/configure.in @@ -1528,7 +1528,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) +AC_CHECK_FUNCS([cbrt clock_gettime copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 6dafb404a1..01a426f714 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -182,6 +182,39 @@ <title>Options</title> <listitem><para>display version information, then exit</para></listitem> </varlistentry> + <varlistentry> + <term><literal><option>--reflink</option>={always|auto|never}</literal></term> + <listitem> + <para> + Determines whether <application>pg_upgrade</application>, when in copy + mode, should use efficient file cloning (also known as + <quote>reflinks</quote>) on some operating systems and file systems. + This can result in near-instantaneous copying of the data files, + giving the speed advantages of + <option>-k</option>/<option>--link</option> while leaving the old + cluster untouched. + </para> + + <para> + The setting <literal>always</literal> requires the use of relinks. If + they are not supported, the <application>pg_upgrade</application> run + will abort. Use this in production to limit the upgrade run time. + The setting <literal>auto</literal> uses reflinks when available, + otherwise it falls back to a normal copy. This is the default. The + setting <literal>never</literal> prevents use of reflinks and always + uses a normal copy. 
This can be useful to ensure that the upgraded + cluster has its disk space fully allocated and not shared with the old + cluster. + </para> + + <para> + At present, reflinks are supported on Linux (kernel 4.5 or later) with + Btrfs and XFS (on file systems created with reflink support, which is + not the default for XFS at this writing), and on macOS with APFS. + </para> + </listitem> + </varlistentry> + <varlistentry> <term><option>-?</option></term> <term><option>--help</option></term> diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 577db73f10..0d7a67539a 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -151,6 +151,8 @@ check_new_cluster(void) if (user_opts.transfer_mode == TRANSFER_MODE_LINK) check_hard_link(); + else if (user_opts.transfer_mode == TRANSFER_MODE_COPY && user_opts.reflink_mode != REFLINK_NEVER) + check_reflink(); check_is_install_user(&new_cluster); diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index f68211aa20..7dd8106c39 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -18,6 +18,13 @@ #include <sys/stat.h> #include <fcntl.h> +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/fs.h> +#endif #ifdef WIN32 @@ -93,6 +100,68 @@ copyFile(const char *src, const char *dst, #endif /* WIN32 */ } +/* + * cloneFile() + * + * Clones/reflinks a relation file from src to dst. + * + * schemaName/relName are relation's SQL name (used for error messages only). + * + * If unsupported_ok is true, then if the cloning fails because the OS or file + * system don't support it, don't error, instead return false. Otherwise, + * true is returned. Based on this, the caller can then try to call + * copyFile() instead, for example. 
+ */ +bool +cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName, + bool unsupported_ok) +{ +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) + { + if (unsupported_ok && errno == ENOTSUP) + return false; + else + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + } + return true; +#elif defined(__linux__) && defined(FICLONE) + int src_fd; + int dest_fd; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n", + schemaName, relName, dst, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + unlink(dst); + if (unsupported_ok && errno == EOPNOTSUPP) + { + close(src_fd); + close(dest_fd); + return false; + } + else + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + } + + close(src_fd); + close(dest_fd); + return true; +#else + return false; +#endif +} + /* * linkFile() @@ -279,6 +348,60 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, close(src_fd); } +void +check_reflink(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.reflinktest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) + { + if (user_opts.reflink_mode == REFLINK_ALWAYS) + 
pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); + else if (user_opts.check) + pg_log(PG_REPORT, "could not clone file between old and new data directories: %s\n", + strerror(errno)); + } +#elif defined(__linux__) && defined(FICLONE) + { + int src_fd; + int dest_fd; + + if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s\n", + existing_file, strerror(errno)); + + if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not create file \"%s\": %s\n", + new_link_file, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + if (user_opts.reflink_mode == REFLINK_ALWAYS) + pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); + else if (user_opts.check) + pg_log(PG_REPORT, "could not clone file between old and new data directories: %s\n", + strerror(errno)); + } + + close(src_fd); + close(dest_fd); + } +#else + pg_fatal("file cloning not supported on this platform\n"); +#endif + + unlink(new_link_file); +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 9dbc9225a6..d52a1bcee3 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -53,6 +53,9 @@ parseCommandLine(int argc, char *argv[]) {"retain", no_argument, NULL, 'r'}, {"jobs", required_argument, NULL, 'j'}, {"verbose", no_argument, NULL, 'v'}, + + {"reflink", required_argument, NULL, 1}, + {NULL, 0, NULL, 0} }; int option; /* Command line option */ @@ -203,6 +206,17 @@ parseCommandLine(int argc, char *argv[]) log_opts.verbose = true; break; + case 1: + if (strcmp(optarg, "always") == 0) + user_opts.reflink_mode = REFLINK_ALWAYS; + else if (strcmp(optarg, "auto") == 0) + user_opts.reflink_mode = REFLINK_AUTO; + else if (strcmp(optarg, "never") == 0) + user_opts.reflink_mode = REFLINK_NEVER; + else + pg_fatal("invalid 
reflink mode: %s\n", optarg); + break; + default: pg_fatal("Try \"%s --help\" for more information.\n", os_info.progname); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 7e5e971294..9adfc87140 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -238,6 +238,16 @@ typedef enum TRANSFER_MODE_LINK } transferMode; +/* + * Enumeration to denote reflink modes + */ +typedef enum +{ + REFLINK_NEVER, + REFLINK_AUTO, + REFLINK_ALWAYS +} reflinkMode; + /* * Enumeration to denote pg_log modes */ @@ -297,6 +307,7 @@ typedef struct bool check; /* true -> ask user for permission to make * changes */ transferMode transfer_mode; /* copy files or link them? */ + reflinkMode reflink_mode; int jobs; } UserOpts; @@ -369,10 +380,14 @@ bool pid_lock_file_exists(const char *datadir); void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName); +bool cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName, + bool unsupported_ok); void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); +void check_reflink(void); void check_hard_link(void); /* fopen_priv() is no longer different from fopen() */ diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index ed604f26ca..fc00cfdfae 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -252,9 +252,34 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro } else if (user_opts.transfer_mode == TRANSFER_MODE_COPY) { - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", - old_file, new_file); - copyFile(old_file, new_file, map->nspname, map->relname); + if (user_opts.reflink_mode == REFLINK_ALWAYS) + { + pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"\n", + old_file, new_file); + 
cloneFile(old_file, new_file, map->nspname, map->relname, false); + } + else if (user_opts.reflink_mode == REFLINK_AUTO) + { + static bool cloning_ok = true; + + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + if (cloning_ok && + !cloneFile(old_file, new_file, map->nspname, map->relname, true)) + { + pg_log(PG_VERBOSE, "cloning not supported, switching to copying\n"); + cloning_ok = false; + copyFile(old_file, new_file, map->nspname, map->relname); + } + else + copyFile(old_file, new_file, map->nspname, map->relname); + } + else + { + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + copyFile(old_file, new_file, map->nspname, map->relname); + } } else { diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 89b8804251..5a87a95d67 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,9 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H base-commit: 3f85c62d9e825eedd1315d249ef1ad793ca78ed4 -- 2.17.1