On Wed, Mar 6, 2024 at 3:16 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> Here's a rebase.

Now with a wait event and a paragraph of documentation.
From 9d5a60e9a9cc4a4312de3081be99c254a8876e42 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH v4] CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but attempting to use efficient file
copying system calls.  The kernel has the opportunity to share block
ranges in copy-on-write file systems, or maybe push down the copy to
network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX Redo currently always uses a plain copy (dbase_redo() passes
clone_files = false).  Should redo try to clone instead, falling back to a
plain copy if cloning is unsupported?

Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 doc/src/sgml/ref/create_database.sgml         |  6 ++
 src/backend/commands/dbcommands.c             | 19 +++--
 src/backend/storage/file/copydir.c            | 82 ++++++++++++++++++-
 .../utils/activity/wait_event_names.txt       |  1 +
 src/include/storage/copydir.h                 |  3 +-
 5 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml
index 72927960ebb..6ed82ee98dd 100644
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@@ -138,6 +138,12 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
         it also forces the system to perform a checkpoint both before and
         after the creation of the new database. In some situations, this may
         have a noticeable negative impact on overall system performance.
+        On some platforms and file systems, the <literal>FILE_CLONE</literal>
+        strategy is available.  It works the same way as
+        <literal>FILE_COPY</literal>, except that it uses file cloning or
+        copying system calls that might offload the copy to the storage
+        layer or share blocks using copy-on-write techniques.  The effect on
+        disk space usage and execution time depends on the file system.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index b256d6d0f7d..8ccfd18b4c9 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -78,11 +78,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
 	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE,
 } CreateDBStrategy;
 
 typedef struct
@@ -136,7 +139,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 									bool isRedo);
 static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid,
-										Oid src_tsid, Oid dst_tsid);
+										Oid src_tsid, Oid dst_tsid,
+										bool clone_files);
 static void recovery_create_dbdir(char *path, bool only_tblspc);
 
 /*
@@ -548,7 +552,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo)
  */
 static void
 CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
-							Oid dst_tsid)
+							Oid dst_tsid, bool clone_files)
 {
 	TableScanDesc scan;
 	Relation	rel;
@@ -608,7 +612,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false, clone_files);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			dbstrategy = CREATEDB_WAL_LOG;
 		else if (strcmp(strategy, "file_copy") == 0)
 			dbstrategy = CREATEDB_FILE_COPY;
+		else if (strcmp(strategy, "file_clone") == 0)
+			dbstrategy = CREATEDB_FILE_CLONE;
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1460,7 +1466,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 									  dst_deftablespace);
 		else
 			CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace,
-										dst_deftablespace);
+										dst_deftablespace,
+										dbstrategy == CREATEDB_FILE_CLONE);
 
 		/*
 		 * Close pg_database, but keep lock till commit.
@@ -2096,7 +2103,7 @@ movedb(const char *dbname, const char *tblspcname)
 		/*
 		 * Copy files from the old tablespace to the new one
 		 */
-		copydir(src_dbpath, dst_dbpath, false);
+		copydir(src_dbpath, dst_dbpath, false, false);
 
 		/*
 		 * Record the filesystem change in XLOG
@@ -3255,7 +3262,7 @@ dbase_redo(XLogReaderState *record)
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(src_path, dst_path, false);
+		copydir(src_path, dst_path, false, false);
 
 		pfree(src_path);
 		pfree(dst_path);
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..7eeec1c34e7 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,12 +21,18 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
 
+static void clone_file(const char *fromfile, const char *tofile);
+
 /*
  * copydir: copy a directory
  *
@@ -34,7 +40,7 @@
  * a directory or a regular file is ignored.
  */
 void
-copydir(const char *fromdir, const char *todir, bool recurse)
+copydir(const char *fromdir, const char *todir, bool recurse, bool clone_files)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -68,10 +74,15 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 		{
 			/* recurse to handle subdirectories */
 			if (recurse)
-				copydir(fromfile, tofile, true);
+				copydir(fromfile, tofile, true, clone_files);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (clone_files)
+				clone_file(fromfile, tofile);
+			else
+				copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +225,68 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone one file, using copyfile() with COPYFILE_CLONE_FORCE if available,
+ * or else copy_file_range().  Any failure is reported with ereport(ERROR).
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not clone file \"%s\" to \"%s\": %m",
+						fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	int			srcfd;
+	int			dstfd;
+	ssize_t		nbytes;
+
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fromfile)));
+
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", tofile)));
+
+	do
+	{
+		/* If we got a cancel signal during the copy of the file, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		/* Cap each call at 1MB so a slow fallback copy stays interruptible */
+		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY);
+		nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, 1024 * 1024, 0);
+		/* on EINTR nbytes stays negative, so we loop around and retry */
+		if (nbytes < 0 && errno != EINTR)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not clone file \"%s\" to \"%s\": %m",
+							fromfile, tofile)));
+		pgstat_report_wait_end();
+	}
+	while (nbytes != 0);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", tofile)));
+
+	if (CloseTransientFile(srcfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fromfile)));
+#else
+	/* errno is not set here, so errcode_for_file_access() would be wrong */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("no file clone facility on this platform")));
+#endif
+}
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index ec2f31f82af..6ec53281009 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -188,6 +188,7 @@ CONTROL_FILE_SYNC	"Waiting for the <filename>pg_control</filename> file to reach
 CONTROL_FILE_SYNC_UPDATE	"Waiting for an update to the <filename>pg_control</filename> file to reach durable storage."
 CONTROL_FILE_WRITE	"Waiting for a write to the <filename>pg_control</filename> file."
 CONTROL_FILE_WRITE_UPDATE	"Waiting for a write to update the <filename>pg_control</filename> file."
+COPY_FILE_COPY	"Waiting for a file copy operation."
 COPY_FILE_READ	"Waiting for a read during a file copy operation."
 COPY_FILE_WRITE	"Waiting for a write during a file copy operation."
 DATA_FILE_EXTEND	"Waiting for a relation data file to be extended."
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..9ff28f2eec9 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,7 +13,8 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
-extern void copydir(const char *fromdir, const char *todir, bool recurse);
+extern void copydir(const char *fromdir, const char *todir, bool recurse,
+					bool clone_files);
 extern void copy_file(const char *fromfile, const char *tofile);
 
 #endif							/* COPYDIR_H */
-- 
2.43.0

Reply via email to