Here is another attempt at implementing file cloning for pg_upgrade and
CREATE DATABASE.  The idea is to take advantage of file systems that can
make copy-on-write clones, which would make the copy run much faster.
For pg_upgrade, this will give the performance of --link mode without
the associated drawbacks.

There have been patches proposed previously [0][1].  The concerns there
were mainly that they required a Linux-specific ioctl() call and only
worked for Btrfs.

Some new things have happened since then:

- XFS has (optional) reflink support.  This file system is probably more
widely used than Btrfs.

- Linux and glibc have a proper function to do this now.

- APFS on macOS supports file cloning.

So altogether this feature will be more widely usable and less ugly to
implement.  Note, however, that you will currently need literally the
latest glibc release, so it probably won't be accessible right now
unless you are using Fedora 28, for example.  (This is the
copy_file_range() function that recently made us rename the identically
named function in pg_rewind.)

Some example measurements:

6 GB database: pg_upgrade takes 30 seconds unpatched, 3 seconds patched
(XFS and APFS)

Results are similar for a CREATE DATABASE from a large template.

Even if you don't have a file system with cloning support, the special
library calls make copying faster.  For instance, in the example above
on APFS, an unpatched CREATE DATABASE takes 30 seconds; with the library
call (but without cloning) it takes 10 seconds.

For amusement/bewilderment, without the recent flush optimization on
APFS, this takes 2 minutes 30 seconds.  I suppose this optimization will
now actually be obsolete, since macOS will no longer hit that code.


[0]:
https://www.postgresql.org/message-id/flat/513C0E7C.5080606%40socialserve.com

[1]:
https://www.postgresql.org/message-id/flat/20140213030731.GE4831%40momjian.us
-- 
Peter Eisentraut              http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From 56b5b574f6d900d5eb4932be499cf3bae0e7ba86 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pete...@gmx.net>
Date: Tue, 20 Feb 2018 10:41:16 -0500
Subject: [PATCH] Use file cloning in pg_upgrade and CREATE DATABASE

For file copying in pg_upgrade and CREATE DATABASE, use special file
cloning calls if available.  This makes the copying faster and more
space efficient.  For pg_upgrade, this achieves speed similar to --link
mode without the associated drawbacks.

On Linux, use copy_file_range().  This supports file cloning
automatically on Btrfs and XFS (if formatted with reflink support).  On
macOS, use copyfile(), which supports file cloning on APFS.

Even on file systems without cloning/reflink support, this is faster
than the existing code, because it avoids copying the file contents out
of kernel space and allows the OS to apply other optimizations.
---
 configure                          |  2 +-
 configure.in                       |  2 +-
 doc/src/sgml/ref/pgupgrade.sgml    | 11 ++++++++
 src/backend/storage/file/copydir.c | 55 +++++++++++++++++++++++++++++++++-----
 src/bin/pg_upgrade/file.c          | 37 ++++++++++++++++++++++++-
 src/include/pg_config.h.in         |  6 +++++
 6 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/configure b/configure
index 7dcca506f8..eb8b321723 100755
--- a/configure
+++ b/configure
@@ -13079,7 +13079,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred 
getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np 
readlink setproctitle setsid shm_open symlink sync_file_range utime utimes 
wcstombs_l
+for ac_func in cbrt clock_gettime copy_file_range copyfile dlopen fdatasync 
getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat 
pthread_is_threaded_np readlink setproctitle setsid shm_open symlink 
sync_file_range utime utimes wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index 4d26034579..dfe3507b25 100644
--- a/configure.in
+++ b/configure.in
@@ -1425,7 +1425,7 @@ PGAC_FUNC_WCSTOMBS_L
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred 
getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np 
readlink setproctitle setsid shm_open symlink sync_file_range utime utimes 
wcstombs_l])
+AC_CHECK_FUNCS([cbrt clock_gettime copy_file_range copyfile dlopen fdatasync 
getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat 
pthread_is_threaded_np readlink setproctitle setsid shm_open symlink 
sync_file_range utime utimes wcstombs_l])
 
 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index 6dafb404a1..3873e71dd1 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -737,6 +737,17 @@ <title>Notes</title>
    is down.
   </para>
 
+  <para>
+   In PostgreSQL 11 and later, <application>pg_upgrade</application>
+   automatically uses efficient file cloning (also known as
+   <quote>reflinks</quote>) on some operating systems and file systems.  This
+   can result in near-instantaneous copying of the data files, giving the
+   speed advantages of <option>-k</option>/<option>--link</option> while
+   leaving the old cluster untouched.  At present, this is supported on Linux
+   (kernel 4.5 or later, glibc 2.27 or later) with Btrfs and XFS (on file
+   systems created with reflink support, which is not the default for XFS at
+   this writing), and on macOS with APFS.
+  </para>
  </refsect1>
 
  <refsect1>
diff --git a/src/backend/storage/file/copydir.c 
b/src/backend/storage/file/copydir.c
index ca6342db0d..cd6398d69a 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,6 +21,9 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#ifdef HAVE_COPYFILE
+#include <copyfile.h>
+#endif
 
 #include "storage/copydir.h"
 #include "storage/fd.h"
@@ -74,7 +77,22 @@ copydir(char *fromdir, char *todir, bool recurse)
                                copydir(fromfile, tofile, true);
                }
                else if (S_ISREG(fst.st_mode))
+               {
+#ifdef HAVE_COPYFILE
+                       if (copyfile(fromfile, tofile, NULL,
+#ifdef COPYFILE_CLONE
+                                                COPYFILE_CLONE
+#else
+                                                COPYFILE_DATA
+#endif
+                                       ) < 0)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not copy file 
\"%s\" to \"%s\": %m", fromfile, tofile)));
+#else
                        copy_file(fromfile, tofile);
+#endif
+               }
        }
        FreeDir(xldir);
 
@@ -126,12 +144,17 @@ copydir(char *fromdir, char *todir, bool recurse)
 void
 copy_file(char *fromfile, char *tofile)
 {
-       char       *buffer;
        int                     srcfd;
        int                     dstfd;
+#ifdef HAVE_COPY_FILE_RANGE
+       struct stat stat;
+       size_t          len;
+#else
+       char       *buffer;
        int                     nbytes;
        off_t           offset;
        off_t           flush_offset;
+#endif
 
        /* Size of copy buffer (read and write requests) */
 #define COPY_BUF_SIZE (8 * BLCKSZ)
@@ -148,9 +171,6 @@ copy_file(char *fromfile, char *tofile)
 #define FLUSH_DISTANCE (1024 * 1024)
 #endif
 
-       /* Use palloc to ensure we get a maxaligned buffer */
-       buffer = palloc(COPY_BUF_SIZE);
-
        /*
         * Open the files
         */
@@ -166,6 +186,28 @@ copy_file(char *fromfile, char *tofile)
                                (errcode_for_file_access(),
                                 errmsg("could not create file \"%s\": %m", 
tofile)));
 
+#ifdef HAVE_COPY_FILE_RANGE
+       if (fstat(srcfd, &stat) < 0)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not stat file \"%s\": %m", 
fromfile)));
+
+       len = stat.st_size;
+
+       do {
+               ssize_t ret = copy_file_range(srcfd, NULL, dstfd, NULL, len, 0);
+               if (ret < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not copy file \"%s\" to 
\"%s\": %m",
+                                                       fromfile, tofile)));
+
+               len -= ret;
+       } while (len > 0);
+#else
+       /* Use palloc to ensure we get a maxaligned buffer */
+       buffer = palloc(COPY_BUF_SIZE);
+
        /*
         * Do the data copying.
         */
@@ -213,12 +255,13 @@ copy_file(char *fromfile, char *tofile)
        if (offset > flush_offset)
                pg_flush_data(dstfd, flush_offset, offset - flush_offset);
 
+       pfree(buffer);
+#endif
+
        if (CloseTransientFile(dstfd))
                ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not close file \"%s\": %m", 
tofile)));
 
        CloseTransientFile(srcfd);
-
-       pfree(buffer);
 }
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index f38bfacf02..f05fd9db9c 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -17,6 +17,9 @@
 
 #include <sys/stat.h>
 #include <fcntl.h>
+#ifdef HAVE_COPYFILE
+#include <copyfile.h>
+#endif
 
 
 #ifdef WIN32
@@ -34,10 +37,25 @@ void
 copyFile(const char *src, const char *dst,
                 const char *schemaName, const char *relName)
 {
-#ifndef WIN32
+#ifdef HAVE_COPYFILE
+       if (copyfile(src, dst, NULL,
+#ifdef COPYFILE_CLONE
+                                COPYFILE_CLONE
+#else
+                                COPYFILE_DATA
+#endif
+                       ) < 0)
+               pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to 
\"%s\"): %s\n",
+                                schemaName, relName, src, dst, 
strerror(errno));
+#elif !defined(WIN32)
        int                     src_fd;
        int                     dest_fd;
+#ifdef HAVE_COPY_FILE_RANGE
+       struct stat stat;
+       size_t          len;
+#else
        char       *buffer;
+#endif
 
        if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
                pg_fatal("error while copying relation \"%s.%s\": could not 
open file \"%s\": %s\n",
@@ -48,6 +66,22 @@ copyFile(const char *src, const char *dst,
                pg_fatal("error while copying relation \"%s.%s\": could not 
create file \"%s\": %s\n",
                                 schemaName, relName, dst, strerror(errno));
 
+#ifdef HAVE_COPY_FILE_RANGE
+       if (fstat(src_fd, &stat) < 0)
+               pg_fatal("could not stat file \"%s\": %s",
+                                src, strerror(errno));
+
+       len = stat.st_size;
+
+       do {
+               ssize_t ret = copy_file_range(src_fd, NULL, dest_fd, NULL, len, 
0);
+               if (ret < 0)
+                       pg_fatal("error while copying relation \"%s.%s\" 
(\"%s\" to \"%s\"): %s\n",
+                                        schemaName, relName, src, dst, 
strerror(errno));
+
+               len -= ret;
+       } while (len > 0);
+#else
        /* copy in fairly large chunks for best efficiency */
 #define COPY_BUF_SIZE (50 * BLCKSZ)
 
@@ -77,6 +111,7 @@ copyFile(const char *src, const char *dst,
        }
 
        pg_free(buffer);
+#endif
        close(src_fd);
        close(dest_fd);
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index f98f773ff0..38e88e0395 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -114,6 +114,12 @@
 /* Define to 1 if your compiler handles computed gotos. */
 #undef HAVE_COMPUTED_GOTO
 
+/* Define to 1 if you have the `copyfile' function. */
+#undef HAVE_COPYFILE
+
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 

base-commit: 9a44a26b65d3d36867267624b76d3dea3dc4f6f6
-- 
2.16.2

Reply via email to