Hello hackers,

Here is an experimental POC of fast/cheap database cloning.  For
clones from little template databases, no one cares much, but it might
be useful to be able to create a snapshot or fork of a very large
database for testing/experimentation like this:

  create database foodb_snapshot20231007 template=foodb strategy=file_clone

It should be a lot faster, and use less physical disk, than the two
existing strategies on recent-ish XFS, BTRFS, very recent OpenZFS,
APFS (= macOS), and it could in theory be extended to other systems
that invented different system calls for this with more work (Solaris,
Windows).  Then extra physical disk space will be consumed only as the
two clones diverge.

It's just like the old strategy=file_copy, except it asks the OS to do
its best copying trick.  If you try it on a system that doesn't
support copy-on-write, then copy_file_range() should fall back to
plain old copy, but it might still be better than we could do, as it
can push copy commands to network storage or physical storage.

Therefore, the usual caveats from strategy=file_copy also apply here.
Namely that it has to perform checkpoints which could be very
expensive, and there are some quirks/brokenness about concurrent
backups and PITR.  Which makes me wonder if it's worth pursuing this
idea.  Thoughts?

I tested on bleeding edge FreeBSD/ZFS, where you need to set sysctl
vfs.zfs.bclone_enabled=1 to enable the optimisation, as it's still a
very new feature that is still being rolled out.  The system call
succeeds either way, but that controls whether the new database
initially shares blocks on disk, or gets new copies.  I also tested on
a Mac.  In both cases I could clone large databases in a fraction of a
second.
From 341c0287237d59b0723bc88244dc1c48ed06c5a5 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH] WIP: CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but using facilities that tell the OS
explicitly that we're copying, so that it has the opportunity to share
block ranges in copy-on-write file systems, or maybe push down the copy
to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX need docs
XXX need a test
XXX need redo -- what to do if unsupported during redo, plain copy?

diff --git a/configure b/configure
index d47e0f8b26..2076b19a1b 100755
--- a/configure
+++ b/configure
@@ -15578,7 +15578,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 440b08d113..d0d31dd91e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 AC_CHECK_FUNCS(m4_normalize([
 	backtrace_symbols
 	copyfile
+	copy_file_range
 	getifaddrs
 	getpeerucred
 	inet_pton
diff --git a/meson.build b/meson.build
index 862c955453..20e7327e9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2415,6 +2415,7 @@ func_checks = [
   ['backtrace_symbols', {'dependencies': [execinfo_dep]}],
   ['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
   ['copyfile'],
+  ['copy_file_range'],
   # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
   # when enabling asan the dlopen check doesn't notice that -ldl is actually
   # required. Just checking for dlsym() ought to suffice.
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 307729ab7e..9bbcabbb6f 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -79,11 +79,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
-	CREATEDB_FILE_COPY
+	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE
 } CreateDBStrategy;
 
 typedef struct
@@ -137,7 +140,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 									bool isRedo);
 static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid,
-										Oid src_tsid, Oid dst_tsid);
+										Oid src_tsid, Oid dst_tsid,
+										bool clone_files);
 static void recovery_create_dbdir(char *path, bool only_tblspc);
 
 /*
@@ -549,7 +553,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo)
  */
 static void
 CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
-							Oid dst_tsid)
+							Oid dst_tsid, bool clone_files)
 {
 	TableScanDesc scan;
 	Relation	rel;
@@ -609,7 +613,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false, clone_files);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			dbstrategy = CREATEDB_WAL_LOG;
 		else if (strcmp(strategy, "file_copy") == 0)
 			dbstrategy = CREATEDB_FILE_COPY;
+		else if (strcmp(strategy, "file_clone") == 0)
+			dbstrategy = CREATEDB_FILE_CLONE;
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1459,7 +1465,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 									  dst_deftablespace);
 		else
 			CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace,
-										dst_deftablespace);
+										dst_deftablespace,
+										dbstrategy == CREATEDB_FILE_CLONE);
 
 		/*
 		 * Close pg_database, but keep lock till commit.
@@ -2095,7 +2102,7 @@ movedb(const char *dbname, const char *tblspcname)
 		/*
 		 * Copy files from the old tablespace to the new one
 		 */
-		copydir(src_dbpath, dst_dbpath, false);
+		copydir(src_dbpath, dst_dbpath, false, false);
 
 		/*
 		 * Record the filesystem change in XLOG
@@ -3251,7 +3258,7 @@ dbase_redo(XLogReaderState *record)
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(src_path, dst_path, false);
+		copydir(src_path, dst_path, false, false);
 
 		pfree(src_path);
 		pfree(dst_path);
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index e04bc3941a..8c963ff548 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -19,14 +19,21 @@
 #include "postgres.h"
 
 #include <fcntl.h>
+#include <limits.h>
 #include <unistd.h>
 
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
 
+static void clone_file(const char *fromfile, const char *tofile);
+
 /*
  * copydir: copy a directory
  *
@@ -34,7 +41,7 @@
  * a directory or a regular file is ignored.
  */
 void
-copydir(const char *fromdir, const char *todir, bool recurse)
+copydir(const char *fromdir, const char *todir, bool recurse, bool clone_files)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -68,10 +75,15 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 		{
 			/* recurse to handle subdirectories */
 			if (recurse)
-				copydir(fromfile, tofile, true);
+				copydir(fromfile, tofile, true, clone_files);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (clone_files)
+				clone_file(fromfile, tofile);
+			else
+				copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +226,78 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone_file: copy one file using the operating system's cloning facility
+ *
+ * Creates tofile as a copy of fromfile.  Where copyfile() with
+ * COPYFILE_CLONE_FORCE is available it is used; otherwise, where
+ * copy_file_range() is available, that is called until it reports end of
+ * file.  Builds that have neither facility raise an error.
+ *
+ * Any failure is reported with ereport(ERROR).
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not clone file \"%s\" to \"%s\": %m",
+						fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	int			srcfd;
+	int			dstfd;
+	ssize_t		nbytes;
+
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fromfile)));
+
+	/* O_EXCL: the destination must not already exist */
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", tofile)));
+
+	/*
+	 * With NULL offsets, copy_file_range() advances both file positions
+	 * itself, so each call resumes where the previous one stopped.  It
+	 * returns 0 at end of file and -1 on error.  Loop until it returns 0: a
+	 * call interrupted by a signal (EINTR) must be retried, not mistaken for
+	 * end of file, or the destination would be silently truncated.
+	 */
+	do
+	{
+		/* Give a pending cancel or terminate request a chance to act. */
+		CHECK_FOR_INTERRUPTS();
+
+		nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, SSIZE_MAX, 0);
+		if (nbytes < 0 && errno != EINTR)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not clone file \"%s\" to \"%s\": %m",
+							fromfile, tofile)));
+	}
+	while (nbytes != 0);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", tofile)));
+
+	if (CloseTransientFile(srcfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fromfile)));
+#else
+	/* No system call failed here, so don't derive the SQLSTATE from errno. */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("no file clone facility on this platform")));
+#endif
+}
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8a2985567..d787484259 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -81,6 +81,9 @@
 /* Define to 1 if you have the <copyfile.h> header file. */
 #undef HAVE_COPYFILE_H
 
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a8be5b21e0..0ad4df936a 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,7 +13,8 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
-extern void copydir(const char *fromdir, const char *todir, bool recurse);
+extern void copydir(const char *fromdir, const char *todir, bool recurse,
+					bool clone_files);
 extern void copy_file(const char *fromfile, const char *tofile);
 
 #endif							/* COPYDIR_H */
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index a50f730260..3d72a6e4aa 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -229,6 +229,7 @@ sub GenerateFiles
 		HAVE_COMPUTED_GOTO => undef,
 		HAVE_COPYFILE => undef,
 		HAVE_COPYFILE_H => undef,
+		HAVE_COPY_FILE_RANGE => undef,
 		HAVE_CRTDEFS_H => undef,
 		HAVE_CRYPTO_LOCK => undef,
 		HAVE_DECL_FDATASYNC => 0,
-- 
2.42.0

Reply via email to