On 3/19/18 22:58, Michael Paquier wrote: > I have been thinking about this patch over the night, and here is a list > of bullet points which would be nice to tackle: > - Remove the current diff in copydir.
done > - Extend copy_file so as it is able to use fcopyfile. fcopyfile() does not support cloning. (This is not documented.) > - Move the work done in pg_upgrade into a common API which can as well > be used by pg_rewind as well. One place would be to have a > frontend-only API in src/common which does the leg work. I would > recommend working only on file descriptors as well for consistency with > copy_file_range. pg_upgrade copies files, whereas pg_rewind needs to copy file ranges. So I don't think this is going to be a good match. We could add support for using Linux copy_file_range() in pg_rewind, but that would work a bit differently. I also don't have a good sense of how to test the performance of that. Another thing to think about is that we go through some trouble to initialize new WAL files so that the disk space is fully allocated. If we used file cloning calls in pg_rewind, that would potentially invalidate some of that. At least, we'd have to think through this more carefully. > - Add proper wait events for the backend calls. Those are missing for > copy_file_range and copyfile. done > - For XLogFileCopy, the problem may be trickier as the tail of a segment > is filled with zeroes, so dropping it from the first version of the > patch sounds wiser. Seems like a possible follow-on project. But see also under pg_rewind above. Another oddity is that pg_upgrade uses CopyFile() on Windows, but the backend does not. The Git log shows that the backend used to use CopyFile(), but that was then removed when the generic code was added, but when pg_upgrade was imported, it came with the CopyFile() call. I suspect the CopyFile() call can be quite a bit faster, so we should consider adding it back in. Or if not, remove it from pg_upgrade. -- Peter Eisentraut http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From bd8fe105f6b1c64098e344c4a7d0fc9c94d2e31d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut <pete...@gmx.net> Date: Tue, 20 Mar 2018 10:21:47 -0400 Subject: [PATCH v2] Use file cloning in pg_upgrade and CREATE DATABASE For file copying in pg_upgrade and CREATE DATABASE, use special file cloning calls if available. This makes the copying faster and more space efficient. For pg_upgrade, this achieves speed similar to --link mode without the associated drawbacks. Other backend users of copydir.c will also take advantage of these changes, but the performance improvement will probably not be as noticeable there. On Linux, use copy_file_range(). This supports file cloning automatically on Btrfs and XFS (if formatted with reflink support). On macOS, use copyfile(), which supports file cloning on APFS. Even on file systems without cloning/reflink support, this is faster than the existing code, because it avoids copying the file contents out of kernel space and allows the OS to apply other optimizations. --- configure | 2 +- configure.in | 2 +- doc/src/sgml/monitoring.sgml | 8 +++- doc/src/sgml/ref/pgupgrade.sgml | 11 +++++ src/backend/postmaster/pgstat.c | 3 ++ src/backend/storage/file/copydir.c | 84 ++++++++++++++++++++++++++++++-------- src/backend/storage/file/reinit.c | 3 +- src/bin/pg_upgrade/file.c | 56 +++++++++++++++++++------ src/include/pg_config.h.in | 6 +++ src/include/pgstat.h | 1 + 10 files changed, 141 insertions(+), 35 deletions(-) diff --git a/configure b/configure index 3943711283..f27c78f63a 100755 --- a/configure +++ b/configure @@ -13085,7 +13085,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copy_file_range copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 1babdbb755..7eb8673753 100644 --- a/configure.in +++ b/configure.in @@ -1428,7 +1428,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) +AC_CHECK_FUNCS([cbrt clock_gettime copy_file_range copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3bc4de57d5..02029e81bc 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1418,7 +1418,7 @@ <title><structname>wait_event</structname> Description</title> <entry>Waiting to apply WAL at recovery because it is delayed.</entry> </row> <row> - <entry morerows="65"><literal>IO</literal></entry> + <entry morerows="66"><literal>IO</literal></entry> <entry><literal>BufFileRead</literal></entry> <entry>Waiting for a read from a buffered file.</entry> </row> @@ -1446,6 +1446,12 @@ <title><structname>wait_event</structname> Description</title> <entry><literal>ControlFileWriteUpdate</literal></entry> <entry>Waiting for a write to update the control file.</entry> </row> + <row> + <entry><literal>CopyFileCopy</literal></entry> + <entry>Waiting for a file copy operation (if the copying is done by + an operating system call rather than as separate read and write + operations).</entry> + </row> <row> <entry><literal>CopyFileRead</literal></entry> <entry>Waiting for a read during a file copy operation.</entry> diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 6dafb404a1..3873e71dd1 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -737,6 +737,17 @@ <title>Notes</title> is down. </para> + <para> + In PostgreSQL 11 and later, <application>pg_upgrade</application> + automatically uses efficient file cloning (also known as + <quote>reflinks</quote>) on some operating systems and file systems. This + can result in near-instantaneous copying of the data files, giving the + speed advantages of <option>-k</option>/<option>--link</option> while + leaving the old cluster untouched. At present, this is supported on Linux + (kernel 4.5 or later, glibc 2.27 or later) with Btrfs and XFS (on file + systems created with reflink support, which is not the default for XFS at + this writing), and on macOS with APFS. + </para> </refsect1> <refsect1> diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 96ba216387..4feb3a5289 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3744,6 +3744,9 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE: event_name = "ControlFileWriteUpdate"; break; + case WAIT_EVENT_COPY_FILE_COPY: + event_name = "CopyFileCopy"; + break; case WAIT_EVENT_COPY_FILE_READ: event_name = "CopyFileRead"; break; diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index ca6342db0d..5aa9742b51 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -21,6 +21,9 @@ #include <fcntl.h> #include <unistd.h> #include <sys/stat.h> +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif #include "storage/copydir.h" #include "storage/fd.h" @@ -126,13 +129,71 @@ copydir(char *fromdir, char *todir, bool recurse) void copy_file(char *fromfile, char *tofile) { - char *buffer; +#ifdef HAVE_COPYFILE + int ret; + + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY); + ret = copyfile(fromfile, tofile, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ); + pgstat_report_wait_end(); + if (ret < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", fromfile, tofile))); +#else int srcfd; int dstfd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else + char *buffer; int nbytes; off_t offset; off_t flush_offset; +#endif + + /* + * Open the files + */ + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fromfile))); + + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (dstfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tofile))); + +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(srcfd, &stat) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", fromfile))); + + len = stat.st_size; + do { + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY); + ssize_t ret = copy_file_range(srcfd, NULL, dstfd, NULL, len, 0); + pgstat_report_wait_end(); + if (ret < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", + fromfile, tofile))); + + len -= ret; + } while (len > 0); +#else /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) @@ -151,21 +212,6 @@ copy_file(char *fromfile, char *tofile) /* Use palloc to ensure we get a maxaligned buffer */ buffer = palloc(COPY_BUF_SIZE); - /* - * Open the files - */ - srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); - if (srcfd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fromfile))); - - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (dstfd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", tofile))); - /* * Do the data copying. */ @@ -213,12 +259,14 @@ copy_file(char *fromfile, char *tofile) if (offset > flush_offset) pg_flush_data(dstfd, flush_offset, offset - flush_offset); + pfree(buffer); +#endif + if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); CloseTransientFile(srcfd); - - pfree(buffer); +#endif } diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 92363ae6ad..2614b27307 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -314,8 +314,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) FreeDir(dbspace_dir); /* - * copy_file() above has already called pg_flush_data() on the files - * it created. Now we need to fsync those files, because a checkpoint + * Now we need to fsync the copied files, because a checkpoint * won't do it for us while we're in recovery. We do this in a * separate pass to allow the kernel to perform all the flushes * (especially the metadata ones) at once. diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index f38bfacf02..4354b64b50 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -17,6 +17,9 @@ #include <sys/stat.h> #include <fcntl.h> +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif #ifdef WIN32 @@ -34,10 +37,32 @@ void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName) { -#ifndef WIN32 +#if defined(HAVE_COPYFILE) + if (copyfile(src, dst, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ) < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); +#elif defined(WIN32) + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + } +#else int src_fd; int dest_fd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else char *buffer; +#endif if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n", @@ -48,6 +73,22 @@ copyFile(const char *src, const char *dst, pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n", schemaName, relName, dst, strerror(errno)); +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(src_fd, &stat) < 0) + pg_fatal("could not stat file \"%s\": %s", + src, strerror(errno)); + + len = stat.st_size; + + do { + ssize_t ret = copy_file_range(src_fd, NULL, dest_fd, NULL, len, 0); + if (ret < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + + len -= ret; + } while (len > 0); +#else /* copy in fairly large chunks for best efficiency */ #define COPY_BUF_SIZE (50 * BLCKSZ) @@ -77,19 +118,10 @@ copyFile(const char *src, const char *dst, } pg_free(buffer); +#endif close(src_fd); close(dest_fd); - -#else /* WIN32 */ - - if (CopyFile(src, dst, true) == 0) - { - _dosmaperr(GetLastError()); - pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", - schemaName, relName, src, dst, strerror(errno)); - } - -#endif /* WIN32 */ +#endif } diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index f98f773ff0..38e88e0395 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,12 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + +/* Define to 1 if you have the `copy_file_range' function. */ +#undef HAVE_COPY_FILE_RANGE + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239b..934bce0fa9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -863,6 +863,7 @@ typedef enum WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, WAIT_EVENT_CONTROL_FILE_WRITE, WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, + WAIT_EVENT_COPY_FILE_COPY, WAIT_EVENT_COPY_FILE_READ, WAIT_EVENT_COPY_FILE_WRITE, WAIT_EVENT_DATA_FILE_EXTEND, base-commit: 13c7c65ec900a30bcddcb27f5fd138dcdbc2ca2e -- 2.16.2