On Mon, Jul 3, 2023 at 7:47 PM Peter Eisentraut <pe...@eisentraut.org> wrote: > When we added --clone, copy_file_range() was available, but the problem > was that it was hard for the user to predict whether you'd get the fast > clone behavior or the slow copy behavior. That's the kind of thing you > want to know when planning and testing your upgrade. At the time, there > were patches passed around in Linux kernel circles that would have been > able to enforce cloning via the flags argument of copy_file_range(), but > that didn't make it to the mainline. > > So, yes, being able to specify exactly which copy mechanism to use makes > sense, so that users can choose the tradeoffs.
Thanks for looking. Yeah, it is quite inconvenient for planning purposes that it is hard for a user to know which internal strategy it uses, but that's the interface we have (and clearly "flags" is reserved for future usage so that might still evolve..). > About your patch: > > I think you should have a "check" function called from > check_new_cluster(). That check function can then also handle the "not > supported" case, and you don't need to handle that in > parseCommandLine(). I suggest following the clone example for these, > since the issues there are very similar. Done.
From 9ea1c3fc39a47f634a4fffd1ff1c9b9cf0299d65 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Fri, 2 Jun 2023 13:35:54 -0400 Subject: [PATCH v2] Add --copy-file-range option to pg_upgrade. The copy_file_range() system call is available on at least Linux and FreeBSD, and asks the kernel to use efficient ways to copy ranges of a file. Options available to the kernel include sharing block ranges (similar to --clone mode), and pushing down block copies to the storage layer. Reviewed-by: Peter Eisentraut <pe...@eisentraut.org> Discussion: https://postgr.es/m/CA%2BhUKGKe7Hb0-UNih8VD5UNZy5-ojxFb3Pr3xSBBL8qj2M2%3DdQ%40mail.gmail.com --- configure | 2 +- configure.ac | 1 + doc/src/sgml/ref/pgupgrade.sgml | 13 +++++ meson.build | 1 + src/bin/pg_upgrade/check.c | 3 ++ src/bin/pg_upgrade/file.c | 78 ++++++++++++++++++++++++++++++ src/bin/pg_upgrade/option.c | 7 ++- src/bin/pg_upgrade/pg_upgrade.h | 4 ++ src/bin/pg_upgrade/relfilenumber.c | 8 +++ src/include/pg_config.h.in | 3 ++ src/tools/msvc/Solution.pm | 1 + 11 files changed, 119 insertions(+), 2 deletions(-) diff --git a/configure b/configure index d47e0f8b26..2076b19a1b 100755 --- a/configure +++ b/configure @@ -15578,7 +15578,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 440b08d113..d0d31dd91e 100644 --- a/configure.ac +++ b/configure.ac @@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` AC_CHECK_FUNCS(m4_normalize([ backtrace_symbols copyfile + copy_file_range getifaddrs getpeerucred inet_pton diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index f17fdb1ba5..8275cc067b 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -263,6 +263,19 @@ PostgreSQL documentation </listitem> </varlistentry> + <varlistentry> + <term><option>--copy-file-range</option></term> + <listitem> + <para> + Use the <function>copy_file_range</function> system call for efficient + copying. On some file systems this gives results similar to + <option>--clone</option>, sharing physical disk blocks, while on others + it may still copy blocks, but do so via an optimized path. At present, + it is supported on Linux and FreeBSD. + </para> + </listitem> + </varlistentry> + <varlistentry> <term><option>-?</option></term> <term><option>--help</option></term> diff --git a/meson.build b/meson.build index 862c955453..20e7327e9e 100644 --- a/meson.build +++ b/meson.build @@ -2415,6 +2415,7 @@ func_checks = [ ['backtrace_symbols', {'dependencies': [execinfo_dep]}], ['clock_gettime', {'dependencies': [rt_dep], 'define': false}], ['copyfile'], + ['copy_file_range'], # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus # when enabling asan the dlopen check doesn't notice that -ldl is actually # required. Just checking for dlsym() ought to suffice. diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 21a0ff9e42..4a615edb62 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -213,6 +213,9 @@ check_new_cluster(void) break; case TRANSFER_MODE_COPY: break; + case TRANSFER_MODE_COPY_FILE_RANGE: + check_copy_file_range(); + break; case TRANSFER_MODE_LINK: check_hard_link(); break; diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index d173602882..e30d944be3 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -10,6 +10,7 @@ #include "postgres_fe.h" #include <sys/stat.h> +#include <limits.h> #include <fcntl.h> #ifdef HAVE_COPYFILE_H #include <copyfile.h> @@ -140,6 +141,45 @@ copyFile(const char *src, const char *dst, } +/* + * copyFileByRange() + * + * Copies a relation file from src to dst. + * schemaName/relName are relation's SQL name (used for error messages only). + */ +void +copyFileByRange(const char *src, const char *dst, + const char *schemaName, const char *relName) +{ +#ifdef HAVE_COPY_FILE_RANGE + int src_fd; + int dest_fd; + ssize_t nbytes; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", + schemaName, relName, dst, strerror(errno)); + + do + { + nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0); + if (nbytes < 0 && errno != EINTR) + pg_fatal("error while copying relation \"%s.%s\": could not copy file range from \"%s\" to \"%s\": %s", + schemaName, relName, src, dst, strerror(errno)); + } + while (nbytes > 0); + + close(src_fd); + close(dest_fd); +#endif +} + + /* * linkFile() * @@ -358,6 +398,44 @@ check_file_clone(void) unlink(new_link_file); } +void +check_copy_file_range(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + +#if defined(HAVE_COPY_FILE_RANGE) + { + int src_fd; + int dest_fd; + + if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s", + existing_file, strerror(errno)); + + if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not create file \"%s\": %s", + new_link_file, strerror(errno)); + + if (copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0) < 0) + pg_fatal("could not copy file range between old and new data directories: %s", + strerror(errno)); + + close(src_fd); + close(dest_fd); + } +#else + pg_fatal("copy_file_range not supported on this platform"); +#endif + + unlink(new_link_file); +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index b9d900d0db..600ba7e8eb 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -58,7 +58,8 @@ parseCommandLine(int argc, char *argv[]) {"verbose", no_argument, NULL, 'v'}, {"clone", no_argument, NULL, 1}, {"copy", no_argument, NULL, 2}, - {"sync-method", required_argument, NULL, 3}, + {"copy-file-range", no_argument, NULL, 3}, + {"sync-method", required_argument, NULL, 4}, {NULL, 0, NULL, 0} }; @@ -203,6 +204,9 @@ parseCommandLine(int argc, char *argv[]) break; case 3: + user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE; + break; + case 4: if (!parse_sync_method(optarg, &unused)) exit(1); user_opts.sync_method = pg_strdup(optarg); @@ -301,6 +305,7 @@ usage(void) printf(_(" -V, --version display version information, then exit\n")); printf(_(" --clone clone instead of copying files to new cluster\n")); printf(_(" --copy copy files to new cluster (default)\n")); + printf(_(" --copy-file-range copy files to new cluster with copy_file_range\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\n" diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 842f3b6cd3..25fb7dc7ad 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -234,6 +234,7 @@ typedef enum { TRANSFER_MODE_CLONE, TRANSFER_MODE_COPY, + TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK } transferMode; @@ -380,11 +381,14 @@ void cloneFile(const char *src, const char *dst, const char *schemaName, const char *relName); void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName); +void copyFileByRange(const char *src, const char *dst, + const char *schemaName, const char *relName); void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); void check_file_clone(void); +void check_copy_file_range(void); void check_hard_link(void); /* fopen_priv() is no longer different from fopen() */ diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 34bc9c1504..094a4db936 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -37,6 +37,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_COPY: prep_status_progress("Copying user relation files"); break; + case TRANSFER_MODE_COPY_FILE_RANGE: + prep_status_progress("Copying user relation files with copy_file_range"); + break; case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; @@ -250,6 +253,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro old_file, new_file); copyFile(old_file, new_file, map->nspname, map->relname); break; + case TRANSFER_MODE_COPY_FILE_RANGE: + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range", + old_file, new_file); + copyFileByRange(old_file, new_file, map->nspname, map->relname); + break; case TRANSFER_MODE_LINK: pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8a2985567..d787484259 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -81,6 +81,9 @@ /* Define to 1 if you have the <copyfile.h> header file. */ #undef HAVE_COPYFILE_H +/* Define to 1 if you have the `copy_file_range' function. */ +#undef HAVE_COPY_FILE_RANGE + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index a50f730260..3d72a6e4aa 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -229,6 +229,7 @@ sub GenerateFiles HAVE_COMPUTED_GOTO => undef, HAVE_COPYFILE => undef, HAVE_COPYFILE_H => undef, + HAVE_COPY_FILE_RANGE => undef, HAVE_CRTDEFS_H => undef, HAVE_CRYPTO_LOCK => undef, HAVE_DECL_FDATASYNC => 0, -- 2.42.0