Hello,

I was just in a pg_upgrade unconference session at PGCon where the
lack of $SUBJECT came up.  This system call gives the kernel the
option to use fast block cloning on XFS, ZFS (as of very recently),
etc, and works on Linux and FreeBSD.  It's probably much the same as
--clone mode on COW file systems, except that is Linux-only.  On
overwrite file systems (ie not copy-on-write, like ext4), it may also
be able to push copies down to storage hardware/network file systems.

There was something like this in the nearby large files patch set, but
in that version it just magically did it when available in --copy
mode.  Now I think the user should have to opt in with
--copy-file-range, and it should simply error out if it fails.  It may not
work in some cases -- for example, the man page says that older Linux
systems can fail with EXDEV when you try to copy across file systems,
while newer systems will do something less efficient but still
sensible internally; also I saw a claim that some older versions had
weird bugs.  Better to just expose the raw functionality and let users
say when they want it and read the error if it fails, I think.
From 571e68a2948c5bff9fa1d66f382c859fc6606829 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 2 Jun 2023 13:35:54 -0400
Subject: [PATCH] Add --copy-file-range option to pg_upgrade.

The copy_file_range() system call is available on at least Linux and
FreeBSD, and asks the kernel to use efficient ways to copy ranges of a
file.  Options available to the kernel include sharing block ranges
(similar to --clone mode), and pushing down block copies to the storage
layer.
---
 configure                          |  2 +-
 configure.ac                       |  1 +
 doc/src/sgml/ref/pgupgrade.sgml    | 13 +++++++++
 meson.build                        |  1 +
 src/bin/pg_upgrade/check.c         |  1 +
 src/bin/pg_upgrade/file.c          | 43 ++++++++++++++++++++++++++++++
 src/bin/pg_upgrade/option.c        | 10 +++++++
 src/bin/pg_upgrade/pg_upgrade.h    |  3 +++
 src/bin/pg_upgrade/relfilenumber.c |  8 ++++++
 src/include/pg_config.h.in         |  3 +++
 src/tools/msvc/Solution.pm         |  1 +
 11 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 1b415142d1..a620e049fa 100755
--- a/configure
+++ b/configure
@@ -15700,7 +15700,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 09558ada0f..69b9256037 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1794,6 +1794,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 AC_CHECK_FUNCS(m4_normalize([
 	backtrace_symbols
 	copyfile
+	copy_file_range
 	getifaddrs
 	getpeerucred
 	inet_pton
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index 7816b4c685..9180513307 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -240,6 +240,19 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--copy-file-range</option></term>
+      <listitem>
+       <para>
+        Use the <function>copy_file_range</function> system call for efficient
+        copying.  On some file systems this gives results similar to
+        <option>--clone</option>, sharing physical disk blocks, while on others
+        it may still copy blocks, but do so via an optimized path.  At present,
+        it is supported on Linux and FreeBSD.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-?</option></term>
       <term><option>--help</option></term>
diff --git a/meson.build b/meson.build
index 16b2e86646..322d8f822d 100644
--- a/meson.build
+++ b/meson.build
@@ -2404,6 +2404,7 @@ func_checks = [
   ['backtrace_symbols', {'dependencies': [execinfo_dep]}],
   ['clock_gettime', {'dependencies': [rt_dep, posix4_dep], 'define': false}],
   ['copyfile'],
+  ['copy_file_range'],
   # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
   # when enabling asan the dlopen check doesn't notice that -ldl is actually
   # required. Just checking for dlsym() ought to suffice.
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 64024e3b9e..8c4e56a568 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -199,6 +199,7 @@ check_new_cluster(void)
 			check_file_clone();
 			break;
 		case TRANSFER_MODE_COPY:
+		case TRANSFER_MODE_COPY_FILE_RANGE:
 			break;
 		case TRANSFER_MODE_LINK:
 			check_hard_link();
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..d8f123bba6 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -10,6 +10,7 @@
 #include "postgres_fe.h"
 
 #include <sys/stat.h>
+#include <limits.h>
 #include <fcntl.h>
 #ifdef HAVE_COPYFILE_H
 #include <copyfile.h>
@@ -140,6 +141,48 @@ copyFile(const char *src, const char *dst,
 }
 
 
+/*
+ * copyFileByRange()
+ *
+ * Copies src to dst by calling copy_file_range() until it returns 0; pg_fatal()
+ * on failure.  schemaName/relName (relation's SQL name) are for messages only.
+ */
+void
+copyFileByRange(const char *src, const char *dst,
+				const char *schemaName, const char *relName)
+{
+#ifdef HAVE_COPY_FILE_RANGE
+	int			src_fd;
+	int			dest_fd;
+	ssize_t		nbytes;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
+				 schemaName, relName, src, strerror(errno));
+
+	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						pg_file_create_mode)) < 0)
+		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
+				 schemaName, relName, dst, strerror(errno));
+
+	for (;;)
+	{
+		nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+		if (nbytes < 0 && errno != EINTR)	/* on EINTR, fall through and retry */
+			pg_fatal("error while copying relation \"%s.%s\": could not copy file range from \"%s\" to \"%s\": %s",
+					 schemaName, relName, src, dst, strerror(errno));
+		if (nbytes == 0)		/* call copied nothing more: done */
+			break;
+	}
+
+	close(src_fd);
+	close(dest_fd);
+#else							/* !HAVE_COPY_FILE_RANGE */
+	pg_fatal("copy_file_range not supported on this platform");
+#endif							/* HAVE_COPY_FILE_RANGE */
+}
+
+
 /*
  * linkFile()
  *
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 640361009e..0734508a2b 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -57,6 +57,7 @@ parseCommandLine(int argc, char *argv[])
 		{"verbose", no_argument, NULL, 'v'},
 		{"clone", no_argument, NULL, 1},
 		{"copy", no_argument, NULL, 2},
+		{"copy-file-range", no_argument, NULL, 3},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -199,6 +200,14 @@ parseCommandLine(int argc, char *argv[])
 				user_opts.transfer_mode = TRANSFER_MODE_COPY;
 				break;
 
+			case 3:
+#ifdef HAVE_COPY_FILE_RANGE
+				user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE;
+#else
+				pg_fatal("copy_file_range not available on this platform");
+#endif
+				break;
+
 			default:
 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
 						os_info.progname);
@@ -289,6 +298,7 @@ usage(void)
 	printf(_("  -V, --version                 display version information, then exit\n"));
 	printf(_("  --clone                       clone instead of copying files to new cluster\n"));
 	printf(_("  --copy                        copy files to new cluster (default)\n"));
+	printf(_("  --copy-file-range             copy files to new cluster with copy_file_range\n"));
 	printf(_("  -?, --help                    show this help, then exit\n"));
 	printf(_("\n"
 			 "Before running pg_upgrade you must:\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 3eea0139c7..a4cb14a49f 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -234,6 +234,7 @@ typedef enum
 {
 	TRANSFER_MODE_CLONE,
 	TRANSFER_MODE_COPY,
+	TRANSFER_MODE_COPY_FILE_RANGE,
 	TRANSFER_MODE_LINK
 } transferMode;
 
@@ -379,6 +380,8 @@ void		cloneFile(const char *src, const char *dst,
 					  const char *schemaName, const char *relName);
 void		copyFile(const char *src, const char *dst,
 					 const char *schemaName, const char *relName);
+void		copyFileByRange(const char *src, const char *dst,
+							const char *schemaName, const char *relName);
 void		linkFile(const char *src, const char *dst,
 					 const char *schemaName, const char *relName);
 void		rewriteVisibilityMap(const char *fromfile, const char *tofile,
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 34bc9c1504..094a4db936 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -37,6 +37,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
 		case TRANSFER_MODE_COPY:
 			prep_status_progress("Copying user relation files");
 			break;
+		case TRANSFER_MODE_COPY_FILE_RANGE:
+			prep_status_progress("Copying user relation files with copy_file_range");
+			break;
 		case TRANSFER_MODE_LINK:
 			prep_status_progress("Linking user relation files");
 			break;
@@ -250,6 +253,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
 						   old_file, new_file);
 					copyFile(old_file, new_file, map->nspname, map->relname);
 					break;
+				case TRANSFER_MODE_COPY_FILE_RANGE:
+					pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range",
+						   old_file, new_file);
+					copyFileByRange(old_file, new_file, map->nspname, map->relname);
+					break;
 				case TRANSFER_MODE_LINK:
 					pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
 						   old_file, new_file);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..0b26836f68 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -85,6 +85,9 @@
 /* Define to 1 if you have the <copyfile.h> header file. */
 #undef HAVE_COPYFILE_H
 
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index b6d31c3583..733376a87e 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -230,6 +230,7 @@ sub GenerateFiles
 		HAVE_COMPUTED_GOTO => undef,
 		HAVE_COPYFILE => undef,
 		HAVE_COPYFILE_H => undef,
+		HAVE_COPY_FILE_RANGE => undef,
 		HAVE_CRTDEFS_H => undef,
 		HAVE_CRYPTO_LOCK => undef,
 		HAVE_DECL_FDATASYNC => 0,
-- 
2.39.2

Reply via email to