On Wed, Oct 11, 2023 at 7:40 PM Peter Eisentraut <pe...@eisentraut.org> wrote:
> On 07.10.23 07:51, Thomas Munro wrote:
> > Here is an experimental POC of fast/cheap database cloning.
>
> Here are some previous discussions of this:
>
> https://www.postgresql.org/message-id/flat/20131001223108.GG23410%40saarenmaa.fi
>
> https://www.postgresql.org/message-id/flat/511B5D11.4040507%40socialserve.com
>
> https://www.postgresql.org/message-id/flat/bc9ca382-b98d-0446-f699-8c5de2307ca7%402ndquadrant.com
>
> (I don't see any clear conclusions in any of these threads, but it might
> be good to check them in any case.)

Thanks.  Wow, quite a lot of people have written an experimental patch
like this.  I would say the things that changed since those ones are:

* copy_file_range() became the preferred way to do this on Linux AFAIK
(instead of various raw ioctls)
* FreeBSD adopted Linux's copy_file_range()
* OpenZFS 2.2 implemented range-based cloning
* XFS enabled reflink support by default
* Apple invented APFS with cloning
* Several OSes adopted XFS, BTRFS, ZFS, APFS by default
* copy_file_range() went in the direction of not revealing how the
copying is done (no flags to force behaviour)

Here's a rebase.

The main thing that is missing is support for redo.  It's mostly
trivial I think: probably just a record type for "try cloning first",
and then teaching that clone function to fall back to the regular copy
path if it fails in recovery.  Do you agree with that idea?  Another
approach would be to let it fail if it doesn't work on the replica, so
you don't end up using dramatically different amounts of disk
space, but that seems terrible because now your replica is broken.  So
probably fall back with a logged warning (?), though I'm not sure exactly
which errnos to give that treatment to.

One thing to highlight about COW file system semantics: PostgreSQL
behaves differently when space runs out.  When writing relation data,
e.g. ZFS sometimes fails like bullet point 2 in this ENOSPC article[1],
while XFS usually fails like bullet point 1.  A database on XFS that
has been cloned in this way might presumably start to fail like bullet
point 2, e.g. when checkpointing dirty pages, instead of its usual
extension-time-only ENOSPC-rolls-back-your-transaction behaviour.

[1] https://wiki.postgresql.org/wiki/ENOSPC
From 49b7077e24da9d8140de3f303d3f90cb3c0dd0f8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH v3] CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but attempting to use efficient file
copying system calls.  The kernel has the opportunity to share block
ranges in copy-on-write file systems, or maybe push down the copy to
network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX needs redo support -- what to do if cloning is unsupported during redo: fall back to plain copy?

Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/backend/commands/dbcommands.c  | 19 ++++---
 src/backend/storage/file/copydir.c | 80 ++++++++++++++++++++++++++++--
 src/include/storage/copydir.h      |  3 +-
 3 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index b256d6d0f7d..8ccfd18b4c9 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -78,11 +78,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
 	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE,
 } CreateDBStrategy;
 
 typedef struct
@@ -136,7 +139,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 									bool isRedo);
 static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid,
-										Oid src_tsid, Oid dst_tsid);
+										Oid src_tsid, Oid dst_tsid,
+										bool clone_files);
 static void recovery_create_dbdir(char *path, bool only_tblspc);
 
 /*
@@ -548,7 +552,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo)
  */
 static void
 CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
-							Oid dst_tsid)
+							Oid dst_tsid, bool clone_files)
 {
 	TableScanDesc scan;
 	Relation	rel;
@@ -608,7 +612,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false, clone_files);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			dbstrategy = CREATEDB_WAL_LOG;
 		else if (strcmp(strategy, "file_copy") == 0)
 			dbstrategy = CREATEDB_FILE_COPY;
+		else if (strcmp(strategy, "file_clone") == 0)
+			dbstrategy = CREATEDB_FILE_CLONE;
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1460,7 +1466,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 									  dst_deftablespace);
 		else
 			CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace,
-										dst_deftablespace);
+										dst_deftablespace,
+										dbstrategy == CREATEDB_FILE_CLONE);
 
 		/*
 		 * Close pg_database, but keep lock till commit.
@@ -2096,7 +2103,7 @@ movedb(const char *dbname, const char *tblspcname)
 		/*
 		 * Copy files from the old tablespace to the new one
 		 */
-		copydir(src_dbpath, dst_dbpath, false);
+		copydir(src_dbpath, dst_dbpath, false, false);
 
 		/*
 		 * Record the filesystem change in XLOG
@@ -3255,7 +3262,7 @@ dbase_redo(XLogReaderState *record)
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(src_path, dst_path, false);
+		copydir(src_path, dst_path, false, false);
 
 		pfree(src_path);
 		pfree(dst_path);
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..d8150e45ad8 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,12 +21,18 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
 
+static void clone_file(const char *fromfile, const char *tofile);
+
 /*
  * copydir: copy a directory
  *
@@ -34,7 +40,7 @@
  * a directory or a regular file is ignored.
  */
 void
-copydir(const char *fromdir, const char *todir, bool recurse)
+copydir(const char *fromdir, const char *todir, bool recurse, bool clone_files)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -68,10 +74,15 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 		{
 			/* recurse to handle subdirectories */
 			if (recurse)
-				copydir(fromfile, tofile, true);
+				copydir(fromfile, tofile, true, clone_files);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (clone_files)
+				clone_file(fromfile, tofile);
+			else
+				copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +225,66 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone_file: clone one file using the platform's facility, or raise ERROR
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not clone file \"%s\" to \"%s\": %m",
+						fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	int			srcfd;
+	int			dstfd;
+	ssize_t		nbytes;
+
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fromfile)));
+
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", tofile)));
+
+	do
+	{
+		/* Check for a pending cancel request before each chunk is copied */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Copy at most 1MB per call, so we can check for interrupts from
+		 * time to time if this falls back to a slow copy.  EINTR is retried.
+		 */
+		nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, 1024 * 1024, 0);
+		if (nbytes < 0 && errno != EINTR)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not clone file \"%s\" to \"%s\": %m",
+							fromfile, tofile)));
+	}
+	while (nbytes != 0);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", tofile)));
+
+	if (CloseTransientFile(srcfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fromfile)));
+#else
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("no file clone facility on this platform")));
+#endif
+}
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..9ff28f2eec9 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,7 +13,8 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
-extern void copydir(const char *fromdir, const char *todir, bool recurse);
+extern void copydir(const char *fromdir, const char *todir, bool recurse,
+					bool clone_files);
 extern void copy_file(const char *fromfile, const char *tofile);
 
 #endif							/* COPYDIR_H */
-- 
2.43.0

Reply via email to