On Mon, Oct 9, 2023 at 2:20 AM Andrew Dunstan <and...@dunslane.net> wrote: > I've had to disable COW on my BTRFS-resident buildfarm animals (see > previous discussion re Direct I/O).
Right, because it is still buggy[1]. I don't see any sign that a fix has been committed yet, assuming that is the right thing (and it sure sounds like it). It means you still have to disable COW to run the 004_io_direct.pl test for now, but that's an independent thing due hopefully to be fixed soon, and you can still run PostgreSQL just fine with COW enabled as it is by default as long as you don't turn on debug_io_direct (which isn't for users yet anyway). Since I hadn't actually tried this cloning stuff out on Linux/btrfs before and was claiming that it should work, I took it for a quick unscientific spin (literally, this is on a spinning SATA disk for extra crunchy slowness...). I created a scale 500 pgbench database, saw that du -h showed 7.4G, and got these times: postgres=# create database foodb_copy template=foodb strategy=file_copy; CREATE DATABASE Time: 124019.885 ms (02:04.020) postgres=# create database foodb_clone template=foodb strategy=file_clone; CREATE DATABASE Time: 8618.195 ms (00:08.618) That's something, but not as good as I was expecting, so let's also try Linux/XFS for reference on the same spinny rust... One thing I learned is that if you have an existing XFS partition, it might have been created without reflinks enabled (see output of xfs_info) as that was the default not very long ago and it's not changeable later, so on the box I'm writing from I had to do a fresh mkfs.xfs to see any benefit from this. postgres=# create database foodb_copy template=foodb strategy=file_copy; CREATE DATABASE Time: 49157.876 ms (00:49.158) postgres=# create database foodb_clone template=foodb strategy=file_clone; CREATE DATABASE Time: 1026.455 ms (00:01.026) Not bad. 
To understand what that did, we can check which physical blocks on disk hold the first segment of the pgbench_accounts table in foodb and foodb_clone: $ sudo xfs_bmap /mnt/xfs/pgdata/base/16384/16400 /mnt/xfs/pgdata/base/16384/16400: 0: [0..1637439]: 977586048..979223487 1: [1637440..2097151]: 1464966136..1465425847 $ sudo xfs_bmap /mnt/xfs/pgdata/base/16419/16400 /mnt/xfs/pgdata/base/16419/16400: 0: [0..1637439]: 977586048..979223487 1: [1637440..2097151]: 1464966136..1465425847 The same blocks. Now let's update a tuple on the second page of pgbench_accounts in the clone: foodb=# update pgbench_accounts set bid = bid + 1 where ctid = '(1, 1)'; UPDATE 1 foodb=# checkpoint; CHECKPOINT Now some new physical disk blocks have been allocated just for that page, but the rest are still clones: $ sudo xfs_bmap /mnt/xfs/pgdata/base/16419/16400 /mnt/xfs/pgdata/base/16419/16400: 0: [0..15]: 977586048..977586063 1: [16..31]: 977586064..977586079 2: [32..1637439]: 977586080..979223487 3: [1637440..2097151]: 1464966136..1465425847 I tried changing it to work in 1MB chunks and add the CFI() (v2 attached), and it didn't affect the time measurably and also didn't generate any extra extents as displayed by xfs_bmap, so the end result is the same. I haven't looked into the chunked version on the other file systems yet. I don't have the numbers to hand (different machines far from me right now) but FreeBSD/ZFS and macOS/APFS were on the order of a few hundred milliseconds for the same scale of pgbench on laptop storage (so not comparable with the above). I also tried a -s 5000 database, and saw that XFS could clone a 74GB database just as fast as the 7.4GB database (still ~1s). 
At a guess, this is going to scale not so much by total data size, but more by things like number of relations, segment size and internal (invisible) fragmentation due to previous cloning/update history in filesystem-dependent ways, since those are the things that generate extents (contiguous ranges of physical blocks to be referenced by the new file). [1] https://lore.kernel.org/linux-btrfs/ae81e48b0e954bae1c3451c0da1a24ae7146606c.1676684984.git.bo...@bur.io/T/#u
From dd5c07d873e90a6feac371d2879015a5e6154632 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 2 Sep 2023 22:21:49 +1200 Subject: [PATCH v2] WIP: CREATE DATABASE ... STRATEGY=FILE_CLONE. Similar to STRATEGY=FILE_COPY, but using facilities that tell the OS explicitly that we're copying, so that it has the opportunity to share block ranges in copy-on-write file systems, or maybe push down the copy to network file systems and storage devices. Currently works on Linux, FreeBSD and macOS. More systems could be supported. XXX need docs XXX need to think more about chunk size and interruptibility XXX need redo -- what to do if unsupported during redo, plain copy? Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com --- configure | 2 +- configure.ac | 1 + meson.build | 1 + src/backend/commands/dbcommands.c | 21 +++++--- src/backend/storage/file/copydir.c | 80 ++++++++++++++++++++++++++++-- src/include/pg_config.h.in | 3 ++ src/include/storage/copydir.h | 3 +- src/tools/msvc/Solution.pm | 1 + 8 files changed, 100 insertions(+), 12 deletions(-) diff --git a/configure b/configure index d47e0f8b26..2076b19a1b 100755 --- a/configure +++ b/configure @@ -15578,7 +15578,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git 
a/configure.ac b/configure.ac index 440b08d113..d0d31dd91e 100644 --- a/configure.ac +++ b/configure.ac @@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` AC_CHECK_FUNCS(m4_normalize([ backtrace_symbols copyfile + copy_file_range getifaddrs getpeerucred inet_pton diff --git a/meson.build b/meson.build index 862c955453..20e7327e9e 100644 --- a/meson.build +++ b/meson.build @@ -2415,6 +2415,7 @@ func_checks = [ ['backtrace_symbols', {'dependencies': [execinfo_dep]}], ['clock_gettime', {'dependencies': [rt_dep], 'define': false}], ['copyfile'], + ['copy_file_range'], # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus # when enabling asan the dlopen check doesn't notice that -ldl is actually # required. Just checking for dlsym() ought to suffice. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 307729ab7e..9bbcabbb6f 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -79,11 +79,14 @@ * CREATEDB_FILE_COPY will simply perform a file system level copy of the * database and log a single record for each tablespace copied. To make this * safe, it also triggers checkpoints before and after the operation. + * + * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls. 
*/ typedef enum CreateDBStrategy { CREATEDB_WAL_LOG, - CREATEDB_FILE_COPY + CREATEDB_FILE_COPY, + CREATEDB_FILE_CLONE } CreateDBStrategy; typedef struct @@ -137,7 +140,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple, static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo); static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, - Oid src_tsid, Oid dst_tsid); + Oid src_tsid, Oid dst_tsid, + bool clone_files); static void recovery_create_dbdir(char *path, bool only_tblspc); /* @@ -549,7 +553,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) */ static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, - Oid dst_tsid) + Oid dst_tsid, bool clone_files) { TableScanDesc scan; Relation rel; @@ -609,7 +613,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * * We don't need to copy subdirectories */ - copydir(srcpath, dstpath, false); + copydir(srcpath, dstpath, false, clone_files); /* Record the filesystem change in XLOG */ { @@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbstrategy = CREATEDB_WAL_LOG; else if (strcmp(strategy, "file_copy") == 0) dbstrategy = CREATEDB_FILE_COPY; + else if (strcmp(strategy, "file_clone") == 0) + dbstrategy = CREATEDB_FILE_CLONE; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1459,7 +1465,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dst_deftablespace); else CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace, - dst_deftablespace); + dst_deftablespace, + dbstrategy == CREATEDB_FILE_CLONE); /* * Close pg_database, but keep lock till commit. 
@@ -2095,7 +2102,7 @@ movedb(const char *dbname, const char *tblspcname) /* * Copy files from the old tablespace to the new one */ - copydir(src_dbpath, dst_dbpath, false); + copydir(src_dbpath, dst_dbpath, false, false); /* * Record the filesystem change in XLOG @@ -3251,7 +3258,7 @@ dbase_redo(XLogReaderState *record) * * We don't need to copy subdirectories */ - copydir(src_path, dst_path, false); + copydir(src_path, dst_path, false, false); pfree(src_path); pfree(dst_path); diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index e04bc3941a..4806a60d4c 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -21,12 +21,18 @@ #include <fcntl.h> #include <unistd.h> +#ifdef HAVE_COPYFILE_H +#include <copyfile.h> +#endif + #include "common/file_utils.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/copydir.h" #include "storage/fd.h" +static void clone_file(const char *fromfile, const char *tofile); + /* * copydir: copy a directory * @@ -34,7 +40,7 @@ * a directory or a regular file is ignored. 
*/ void -copydir(const char *fromdir, const char *todir, bool recurse) +copydir(const char *fromdir, const char *todir, bool recurse, bool clone_files) { DIR *xldir; struct dirent *xlde; @@ -68,10 +74,15 @@ copydir(const char *fromdir, const char *todir, bool recurse) { /* recurse to handle subdirectories */ if (recurse) - copydir(fromfile, tofile, true); + copydir(fromfile, tofile, true, clone_files); } else if (xlde_type == PGFILETYPE_REG) - copy_file(fromfile, tofile); + { + if (clone_files) + clone_file(fromfile, tofile); + else + copy_file(fromfile, tofile); + } } FreeDir(xldir); @@ -214,3 +225,66 @@ copy_file(const char *fromfile, const char *tofile) pfree(buffer); } + +/* + * clone one file + */ +static void +clone_file(const char *fromfile, const char *tofile) +{ +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not clone file \"%s\" to \"%s\": %m", + fromfile, tofile))); +#elif defined(HAVE_COPY_FILE_RANGE) + int srcfd; + int dstfd; + ssize_t nbytes; + + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fromfile))); + + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (dstfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tofile))); + + do + { + /* If we got a cancel signal during the copy of the file, quit */ + CHECK_FOR_INTERRUPTS(); + + /* + * Don't copy too much at once, so we can check for interrupts from + * time to time if this falls back to a slow copy. 
+ */ + nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, 1024 * 1024, 0); + if (nbytes < 0 && errno != EINTR) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not clone file \"%s\" to \"%s\": %m", + fromfile, tofile))); + } + while (nbytes > 0); + + if (CloseTransientFile(dstfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tofile))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fromfile))); +#else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("no file clone facility on this platform"))); +#endif +} diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8a2985567..d787484259 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -81,6 +81,9 @@ /* Define to 1 if you have the <copyfile.h> header file. */ #undef HAVE_COPYFILE_H +/* Define to 1 if you have the `copy_file_range' function. */ +#undef HAVE_COPY_FILE_RANGE + /* Define to 1 if you have the <crtdefs.h> header file. 
*/ #undef HAVE_CRTDEFS_H diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h index a8be5b21e0..0ad4df936a 100644 --- a/src/include/storage/copydir.h +++ b/src/include/storage/copydir.h @@ -13,7 +13,8 @@ #ifndef COPYDIR_H #define COPYDIR_H -extern void copydir(const char *fromdir, const char *todir, bool recurse); +extern void copydir(const char *fromdir, const char *todir, bool recurse, + bool clone_files); extern void copy_file(const char *fromfile, const char *tofile); #endif /* COPYDIR_H */ diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index a50f730260..3d72a6e4aa 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -229,6 +229,7 @@ sub GenerateFiles HAVE_COMPUTED_GOTO => undef, HAVE_COPYFILE => undef, HAVE_COPYFILE_H => undef, + HAVE_COPY_FILE_RANGE => undef, HAVE_CRTDEFS_H => undef, HAVE_CRYPTO_LOCK => undef, HAVE_DECL_FDATASYNC => 0, -- 2.39.2