I've spent quite a bit of time recently trying to get this patch set into a reasonable state. It's still a little rough around the edges, and the code for the generated scripts is incomplete, but I figured I'd at least get some CI testing going.
-- nathan
>From 0af23114cfe5d00ab0b69ff804bb92d58d485adb Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 19 Feb 2025 09:14:51 -0600 Subject: [PATCH v3 1/4] initdb: Add --no-sync-data-files. This new option instructs initdb to skip synchronizing any files in database directories and the database directories themselves, i.e., everything in the base/ subdirectory and any other tablespace directories. Other files, such as those in pg_wal/ and pg_xact/, will still be synchronized unless --no-sync is also specified. --no-sync-data-files is primarily intended for internal use by tools that separately ensure the skipped files are synchronized to disk. A follow-up commit will use this to help optimize pg_upgrade's file transfer step. Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan --- doc/src/sgml/ref/initdb.sgml | 20 +++++ src/bin/initdb/initdb.c | 10 ++- src/bin/initdb/t/001_initdb.pl | 1 + src/bin/pg_basebackup/pg_basebackup.c | 2 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_combinebackup/pg_combinebackup.c | 2 +- src/bin/pg_rewind/file_ops.c | 2 +- src/common/file_utils.c | 85 +++++++++++++-------- src/include/common/file_utils.h | 2 +- 9 files changed, 89 insertions(+), 37 deletions(-) diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 0026318485a..14c401b9a99 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -527,6 +527,26 @@ PostgreSQL documentation </listitem> </varlistentry> + <varlistentry id="app-initdb-option-no-sync-data-files"> + <term><option>--no-sync-data-files</option></term> + <listitem> + <para> + By default, <command>initdb</command> safely writes all database files + to disk. This option instructs <command>initdb</command> to skip + synchronizing all files in the individual database directories and the + database directories themselves, i.e., everything in the + <filename>base</filename> subdirectory and any other tablespace + directories. Other files, such as those in <literal>pg_wal</literal> + and <literal>pg_xact</literal>, will still be synchronized unless the + <option>--no-sync</option> option is also specified. + </para> + <para> + This option is primarily intended for internal use by tools that + separately ensure the skipped files are synchronized to disk. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="app-initdb-option-no-instructions"> <term><option>--no-instructions</option></term> <listitem> diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 21a0fe3ecd9..22b7d31b165 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -168,6 +168,7 @@ static bool data_checksums = true; static char *xlog_dir = NULL; static int wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024); static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static bool sync_data_files = true; /* internal vars */ @@ -2566,6 +2567,7 @@ usage(const char *progname) printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --no-clean do not clean up after errors\n")); printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" --no-sync-data-files do not sync files within database directories\n")); printf(_(" --no-instructions do not print instructions for next steps\n")); printf(_(" -s, --show show internal settings, then exit\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); @@ -3208,6 +3210,7 @@ main(int argc, char *argv[]) {"icu-rules", required_argument, NULL, 18}, {"sync-method", required_argument, NULL, 19}, {"no-data-checksums", no_argument, NULL, 20}, + {"no-sync-data-files", no_argument, NULL, 21}, {NULL, 0, NULL, 0} }; @@ -3402,6 +3405,9 @@ main(int argc, char *argv[]) case 20: data_checksums = false; break; + case 21: + sync_data_files = false; + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -3453,7 +3459,7 @@ main(int argc, char *argv[]) fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); return 0; } @@ -3516,7 +3522,7 @@ main(int argc, char *argv[]) { fputs(_("syncing data to disk ... 
"), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); } else diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 01cc4a1602b..15dd10ce40a 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -76,6 +76,7 @@ command_like( 'checksums are enabled in control file'); command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only'); +command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], '--no-sync-data-files'); command_fails([ 'initdb', $datadir ], 'existing data directory'); if ($supports_syncfs) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index dc0c805137a..bc94c114d27 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2310,7 +2310,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, } else { - (void) sync_pgdata(basedir, serverVersion, sync_method); + (void) sync_pgdata(basedir, serverVersion, sync_method, true); } } diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index e1acb6e933d..3bbd8f616cf 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -633,7 +633,7 @@ main(int argc, char *argv[]) if (do_sync) { pg_log_info("syncing data directory"); - sync_pgdata(DataDir, PG_VERSION_NUM, sync_method); + sync_pgdata(DataDir, PG_VERSION_NUM, sync_method, true); } pg_log_info("updating control file"); diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 5864ec574fb..c0ec09485c3 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -420,7 +420,7 @@ main(int argc, char *argv[]) else { pg_log_debug("recursively fsyncing \"%s\"", opt.output); - sync_pgdata(opt.output, version * 10000, opt.sync_method); + sync_pgdata(opt.output, version * 10000, opt.sync_method, true); } } diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 467845419ed..55659ce201f 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -296,7 +296,7 @@ sync_target_dir(void) if (!do_sync || dry_run) return; - sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method); + sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method, true); } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 0e3cfede935..78e272916f5 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -50,7 +50,8 @@ static int pre_sync_fname(const char *fname, bool isdir); #endif static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), - bool process_symlinks); + bool process_symlinks, + const char *exclude_dir); #ifdef HAVE_SYNCFS @@ -93,11 +94,15 @@ do_syncfs(const char *path) * syncing, and might not have privileges to write at all. * * serverVersion indicates the version of the server to be sync'd. + * + * If sync_data_files is false, this function skips syncing "base/" and any + * other tablespace directories. */ void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method) + DataDirSyncMethod sync_method, + bool sync_data_files) { bool xlog_is_symlink; char pg_wal[MAXPGPATH]; @@ -147,30 +152,33 @@ sync_pgdata(const char *pg_data, do_syncfs(pg_data); /* If any tablespaces are configured, sync each of those. 
*/ - dir = opendir(pg_tblspc); - if (dir == NULL) - pg_log_error("could not open directory \"%s\": %m", - pg_tblspc); - else + if (sync_data_files) { - while (errno = 0, (de = readdir(dir)) != NULL) + dir = opendir(pg_tblspc); + if (dir == NULL) + pg_log_error("could not open directory \"%s\": %m", + pg_tblspc); + else { - char subpath[MAXPGPATH * 2]; + while (errno = 0, (de = readdir(dir)) != NULL) + { + char subpath[MAXPGPATH * 2]; - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) - continue; + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; - snprintf(subpath, sizeof(subpath), "%s/%s", - pg_tblspc, de->d_name); - do_syncfs(subpath); - } + snprintf(subpath, sizeof(subpath), "%s/%s", + pg_tblspc, de->d_name); + do_syncfs(subpath); + } - if (errno) - pg_log_error("could not read directory \"%s\": %m", - pg_tblspc); + if (errno) + pg_log_error("could not read directory \"%s\": %m", + pg_tblspc); - (void) closedir(dir); + (void) closedir(dir); + } } /* If pg_wal is a symlink, process that too. */ @@ -182,15 +190,21 @@ sync_pgdata(const char *pg_data, case DATA_DIR_SYNC_METHOD_FSYNC: { + char *exclude_dir = NULL; + + if (!sync_data_files) + exclude_dir = psprintf("%s/base", pg_data); + /* * If possible, hint to the kernel that we're soon going to * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false); + walkdir(pg_data, pre_sync_fname, false, exclude_dir); if (xlog_is_symlink) - walkdir(pg_wal, pre_sync_fname, false); - walkdir(pg_tblspc, pre_sync_fname, true); + walkdir(pg_wal, pre_sync_fname, false, NULL); + if (sync_data_files) + walkdir(pg_tblspc, pre_sync_fname, true, NULL); #endif /* @@ -203,10 +217,14 @@ sync_pgdata(const char *pg_data, * get fsync'd twice. That's not an expected case so we don't * worry about optimizing it. */ - walkdir(pg_data, fsync_fname, false); + walkdir(pg_data, fsync_fname, false, exclude_dir); if (xlog_is_symlink) - walkdir(pg_wal, fsync_fname, false); - walkdir(pg_tblspc, fsync_fname, true); + walkdir(pg_wal, fsync_fname, false, NULL); + if (sync_data_files) + walkdir(pg_tblspc, fsync_fname, true, NULL); + + if (exclude_dir) + pfree(exclude_dir); } break; } @@ -245,10 +263,10 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(dir, pre_sync_fname, false); + walkdir(dir, pre_sync_fname, false, NULL); #endif - walkdir(dir, fsync_fname, false); + walkdir(dir, fsync_fname, false, NULL); } break; } @@ -264,6 +282,9 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * ignored in subdirectories, ie we intentionally don't pass down the * process_symlinks flag to recursive calls. * + * If exclude_dir is not NULL, it specifies a directory path to skip + * processing. + * * Errors are reported but not considered fatal. * * See also walkdir in fd.c, which is a backend version of this logic. 
@@ -271,11 +292,15 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), - bool process_symlinks) + bool process_symlinks, + const char *exclude_dir) { DIR *dir; struct dirent *de; + if (exclude_dir && strcmp(exclude_dir, path) == 0) + return; + dir = opendir(path); if (dir == NULL) { @@ -299,7 +324,7 @@ walkdir(const char *path, (*action) (subpath, false); break; case PGFILETYPE_DIR: - walkdir(subpath, action, false); + walkdir(subpath, action, false, exclude_dir); break; default: diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index a832210adc1..8274bc877ab 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -35,7 +35,7 @@ struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir); extern void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method); + DataDirSyncMethod sync_method, bool sync_data_files); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); -- 2.39.5 (Apple Git-154)
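For reference, a rough sketch of how the new switch is meant to be driven (the data directory path below is just a placeholder); this is essentially how patch 4 ends up invoking it, since pg_upgrade takes care of syncing the transferred data files itself:

    # Sync the data directory, but skip everything under base/ and any
    # other tablespace directories; the caller is assumed to have
    # synchronized those files separately.
    initdb --sync-only --no-sync-data-files /path/to/datadir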
>From d344dfcc9b96253702025e551ee3e8dd720bb0d6 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 19 Feb 2025 11:25:28 -0600 Subject: [PATCH v3 2/4] pg_dump: Add --sequence-data. This new option instructs pg_dump to dump sequence data when the --no-data, --schema-only, or --statistics-only option is specified. This was originally considered for commit a7e5457db8, but it was left out at that time because there was no known use-case. A follow-up commit will use this to optimize pg_upgrade's file transfer step. Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan --- doc/src/sgml/ref/pg_dump.sgml | 11 +++++++++++ src/bin/pg_dump/pg_dump.c | 10 ++-------- src/bin/pg_dump/t/002_pg_dump.pl | 1 + src/bin/pg_upgrade/dump.c | 2 +- src/test/modules/test_pg_dump/t/001_base.pl | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 1975054d7bf..b05f16995c3 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -1289,6 +1289,17 @@ PostgreSQL documentation </listitem> </varlistentry> + <varlistentry> + <term><option>--sequence-data</option></term> + <listitem> + <para> + Include sequence data in the dump. This is the default behavior except + when <option>--no-data</option>, <option>--schema-only</option>, or + <option>--statistics-only</option> is specified. + </para> + </listitem> + </varlistentry> + <varlistentry> <term><option>--serializable-deferrable</option></term> <listitem> diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 4f4ad2ee150..f63215eb3f9 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -517,6 +517,7 @@ main(int argc, char **argv) {"sync-method", required_argument, NULL, 15}, {"filter", required_argument, NULL, 16}, {"exclude-extension", required_argument, NULL, 17}, + {"sequence-data", no_argument, &dopt.sequence_data, 1}, {NULL, 0, NULL, 0} }; @@ -803,14 +804,6 @@ main(int argc, char **argv) if (dopt.column_inserts && dopt.dump_inserts == 0) dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT; - /* - * Binary upgrade mode implies dumping sequence data even in schema-only - * mode. This is not exposed as a separate option, but kept separate - * internally for clarity. 
- */ - if (dopt.binary_upgrade) - dopt.sequence_data = 1; - if (data_only && schema_only) pg_fatal("options -s/--schema-only and -a/--data-only cannot be used together"); if (schema_only && statistics_only) @@ -1275,6 +1268,7 @@ help(const char *progname) printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n")); printf(_(" --section=SECTION dump named section (pre-data, data, or post-data)\n")); + printf(_(" --sequence-data include sequence data in dump\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index c7bffc1b045..8ae6c5374fc 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -66,6 +66,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/binary_upgrade.dump", '--no-password', '--no-data', + '--sequence-data', '--binary-upgrade', '--dbname' => 'postgres', # alternative way to specify database ], diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 23fe7280a16..b8fd0d0acee 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -52,7 +52,7 @@ generate_old_dump(void) snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); parallel_exec_prog(log_file_name, NULL, - "\"%s/pg_dump\" %s --no-data %s --quote-all-identifiers " + "\"%s/pg_dump\" %s --no-data %s --sequence-data --quote-all-identifiers " "--binary-upgrade --format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), log_opts.verbose ? "--verbose" : "", diff --git a/src/test/modules/test_pg_dump/t/001_base.pl b/src/test/modules/test_pg_dump/t/001_base.pl index 9b2a90b0469..27c6c2ab0f3 100644 --- a/src/test/modules/test_pg_dump/t/001_base.pl +++ b/src/test/modules/test_pg_dump/t/001_base.pl @@ -48,7 +48,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', "--file=$tempdir/binary_upgrade.sql", '--schema-only', - '--binary-upgrade', '--dbname=postgres', + '--sequence-data', '--binary-upgrade', '--dbname=postgres', ], }, clean => { -- 2.39.5 (Apple Git-154)
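A quick usage sketch (database name is hypothetical): the new switch only matters alongside one of the data-suppressing options, e.g. a schema-only dump that still carries the current sequence values:

    # Schema-only dump that still includes sequence data.
    pg_dump --schema-only --sequence-data --file=schema.sql mydb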
>From 04063a995759c9f32bd87b0155c68a2c5fb346ed Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 26 Feb 2025 11:44:36 -0600 Subject: [PATCH v3 3/4] Add new frontend functions for durable file operations. This commit exports the existing pre_sync_fname() function and adds durable_mkdir_p() and durable_rename_dir() for use in frontend programs. A follow-up commit will use this to help optimize pg_upgrade's file transfer step. Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan --- src/common/file_utils.c | 55 +++++++++++++++++++++++++++------ src/include/common/file_utils.h | 3 ++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 78e272916f5..a5a03abd7ca 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -26,6 +26,7 @@ #include "common/file_utils.h" #ifdef FRONTEND +#include "common/file_perm.h" #include "common/logging.h" #endif #include "common/relpath.h" @@ -45,9 +46,6 @@ */ #define MINIMUM_VERSION_FOR_PG_WAL 100000 -#ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir); -#endif static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks, @@ -352,16 +350,16 @@ walkdir(const char *path, } /* - * Hint to the OS that it should get ready to fsync() this file. + * Hint to the OS that it should get ready to fsync() this file, if supported + * by the platform. * * Ignores errors trying to open unreadable files, and reports other errors * non-fatally. */ -#ifdef PG_FLUSH_DATA_WORKS - -static int +int pre_sync_fname(const char *fname, bool isdir) { +#ifdef PG_FLUSH_DATA_WORKS int fd; fd = open(fname, O_RDONLY | PG_BINARY, 0); @@ -388,11 +386,10 @@ pre_sync_fname(const char *fname, bool isdir) #endif (void) close(fd); +#endif /* PG_FLUSH_DATA_WORKS */ return 0; } -#endif /* PG_FLUSH_DATA_WORKS */ - /* * fsync_fname -- Try to fsync a file or directory * @@ -539,6 +536,46 @@ durable_rename(const char *oldfile, const char *newfile) return 0; } +/* + * durable_rename_dir: rename(2) wrapper for directories, issuing fsyncs + * required for durability. + */ +int +durable_rename_dir(const char *olddir, const char *newdir) +{ + if (fsync_fname(olddir, true) != 0 || + fsync_parent_path(olddir) != 0 || + fsync_parent_path(newdir) != 0) + return -1; + + if (rename(olddir, newdir) != 0) + return -1; + + if (fsync_fname(newdir, true) != 0 || + fsync_parent_path(olddir) != 0 || + fsync_parent_path(newdir) != 0) + return -1; + + return 0; +} + +/* + * durable_mkdir_p: pg_mkdir_p() wrapper, issuing fsyncs required for + * durability. 
+ */ +int +durable_mkdir_p(char *newdir) +{ + if (pg_mkdir_p(newdir, pg_dir_create_mode) && errno != EEXIST) + return -1; + + if (fsync_fname(newdir, true) != 0 || + fsync_parent_path(newdir) != 0) + return -1; + + return 0; +} + #endif /* FRONTEND */ /* diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 8274bc877ab..7d253a4cb51 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -33,11 +33,14 @@ typedef enum DataDirSyncMethod struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND +extern int pre_sync_fname(const char *fname, bool isdir); extern int fsync_fname(const char *fname, bool isdir); extern void sync_pgdata(const char *pg_data, int serverVersion, DataDirSyncMethod sync_method, bool sync_data_files); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); +extern int durable_rename_dir(const char *olddir, const char *newdir); +extern int durable_mkdir_p(char *newdir); extern int fsync_parent_path(const char *fname); #endif -- 2.39.5 (Apple Git-154)
>From c8540d235c0dc6cac817a3b9f3336c3336af5886 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Fri, 28 Feb 2025 13:00:50 -0600 Subject: [PATCH v3 4/4] pg_upgrade: Add --swap for faster file transfer. This new option instructs pg_upgrade to move the data directories from the old cluster to the new cluster and then to replace the catalog files with those generated for the new cluster. This mode can outperform --link, --clone, --copy, and --copy-file-range, especially on clusters with many relations. However, this mode creates many garbage files in the old cluster, which can prolong the file synchronization step. To handle that, we use "initdb --sync-only --no-sync-data-files" for file synchronization, and we synchronize the catalog files as they are transferred. We assume that the database files transferred from the old cluster were synchronized prior to upgrade. This mode also complicates reverting to the old cluster. For this reason, pg_upgrade generates a script to perform the necessary steps. The new mode is limited to clusters located in the same file system and to upgrades from version 10 and newer. Discussion: https://postgr.es/m/Zyvop-LxLXBLrZil%40nathan --- doc/src/sgml/ref/pgupgrade.sgml | 69 +++++- src/bin/pg_upgrade/.gitignore | 2 + src/bin/pg_upgrade/Makefile | 2 +- src/bin/pg_upgrade/check.c | 82 ++++++- src/bin/pg_upgrade/dump.c | 4 +- src/bin/pg_upgrade/file.c | 16 +- src/bin/pg_upgrade/info.c | 4 +- src/bin/pg_upgrade/option.c | 7 + src/bin/pg_upgrade/pg_upgrade.c | 4 +- src/bin/pg_upgrade/pg_upgrade.h | 4 +- src/bin/pg_upgrade/relfilenumber.c | 374 +++++++++++++++++++++++++++++ 11 files changed, 558 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 7bdd85c5cff..6ca20f19ec2 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -244,7 +244,8 @@ PostgreSQL documentation <listitem> <para> Copy files to the new cluster. This is the default. (See also - <option>--link</option> and <option>--clone</option>.) + <option>--link</option>, <option>--clone</option>, + <option>--copy-file-range</option>, and <option>--swap</option>.) </para> </listitem> </varlistentry> @@ -262,6 +263,33 @@ PostgreSQL documentation </listitem> </varlistentry> + <varlistentry> + <term><option>--swap</option></term> + <listitem> + <para> + Move the data directories from the old cluster to the new cluster. + Then, replace the catalog files with those generated for the new + cluster. This mode can outperform <option>--link</option>, + <option>--clone</option>, <option>--copy</option>, and + <option>--copy-file-range</option>, especially on clusters with many + relations. + </para> + <para> + However, this mode creates many garbage files in the old cluster, which + can prolong the file synchronization step if + <option>--sync-method=syncfs</option> is used. Therefore, it is + recommended to use <option>--sync-method=fsync</option> with + <option>--swap</option>. + </para> + <para> + Additionally, this mode complicates reverting to the old cluster. For + this reason, <application>pg_upgrade</application> generates a script + to perform the necessary steps. See + <xref linkend="pgupgrade-step-revert"/> for details. + </para> + </listitem> + </varlistentry> + <varlistentry> <term><option>--sync-method=</option><replaceable>method</replaceable></term> <listitem> @@ -530,6 +558,10 @@ NET STOP postgresql-&majorversion; is started. 
Clone mode also requires that the old and new data directories be in the same file system. This mode is only available on certain operating systems and file systems. + Swap mode may be the fastest if there are many relations, but like link + mode, you will not be able to access your old cluster once you start the + new cluster after the upgrade. Swap mode also requires that the old and + new cluster data directories be in the same file system. </para> <para> @@ -889,6 +921,41 @@ psql --username=postgres --file=script.sql postgres </itemizedlist></para> </listitem> + + <listitem> + <para> + If the <option>--swap</option> option was used, the data directories + and their files might be moved between the old and new clusters: + + <itemizedlist> + <listitem> + <para> + If <command>pg_upgrade</command> aborted before moving any data + directories or their files, the old cluster was unmodified; it can + be restarted. + </para> + </listitem> + + <listitem> + <para> + If you did <emphasis>not</emphasis> start the new cluster, the + content of the database files was unmodified, but the data + directories and their files were moved between the old and new + clusters. To reuse the old cluster, run the script mentioned before + <command>pg_upgrade</command> started the file transfer step. + </para> + </listitem> + + <listitem> + <para> + If you did start the new cluster, it has written to the files, and + it is unsafe to use the old cluster. The old cluster will need to be + restored from backup in this case. + </para> + </listitem> + </itemizedlist> + </para> + </listitem> </itemizedlist></para> </step> </procedure> diff --git a/src/bin/pg_upgrade/.gitignore b/src/bin/pg_upgrade/.gitignore index a66166ea0fa..ea3a0046e51 100644 --- a/src/bin/pg_upgrade/.gitignore +++ b/src/bin/pg_upgrade/.gitignore @@ -3,6 +3,8 @@ /delete_old_cluster.sh /delete_old_cluster.bat /reindex_hash.sql +/revert_to_old_cluster.sh +/revert_to_old_cluster.bat # Generated by test suite /log/ /tmp_check/ diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index f83d2b5d309..67ac34443af 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -53,7 +53,7 @@ uninstall: clean distclean: rm -f pg_upgrade$(X) $(OBJS) rm -rf delete_old_cluster.sh log/ tmp_check/ \ - reindex_hash.sql + reindex_hash.sql revert_to_old_cluster.sh export with_icu diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 88db8869b6e..9d27097ad94 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -709,7 +709,34 @@ check_new_cluster(void) check_copy_file_range(); break; case TRANSFER_MODE_LINK: - check_hard_link(); + check_hard_link(TRANSFER_MODE_LINK); + break; + case TRANSFER_MODE_SWAP: + + /* + * We do the hard link check for --swap, too, since it's an easy + * way to verify the clusters are in the same file system. This + * allows us to take some shortcuts in the file synchronization + * step. With some more effort, we could probably support the + * separate-file-system use case, but this mode is unlikely to + * offer much benefit if we have to copy the files across file + * system boundaries. + */ + check_hard_link(TRANSFER_MODE_SWAP); + + /* + * There are a few known issues with using --swap to upgrade from + * versions older than 10. For example, the sequence tuple format + * changed in v10, and the visibility map format changed in 9.6. 
+ * While such problems are not insurmountable (and we may have to + * deal with similar problems in the future, anyway), it doesn't + * seem worth the effort to support swap mode for upgrades from + * long-unsupported versions. + */ + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1000) + pg_fatal("Swap mode can only upgrade clusters from PostgreSQL version %s and later.", + "10"); + break; } @@ -928,6 +955,8 @@ check_for_new_tablespace_dir(void) * create_script_for_old_cluster_deletion() * * This is particularly useful for tablespace deletion. + * + * XXX: DO WE NEED TO MODIFY THIS FOR SWAP MODE? */ void create_script_for_old_cluster_deletion(char **deletion_script_file_name) @@ -1046,6 +1075,57 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) } +/* + * create_script_for_swap_revert() + * + * Reverting to the old cluster when --swap is used is complicated, so we + * generate a script to make it easy. + */ +void +create_script_for_swap_revert(void) +{ + char *script; + FILE *fd; + + script = psprintf("%srevert_to_old_cluster.%s", SCRIPT_PREFIX, SCRIPT_EXT); + + prep_status("Creating script to revert to old cluster"); + + if ((fd = fopen_priv(script, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", script); + +#ifndef WIN32 + /* add shebang header */ + fprintf(fd, "#!/bin/sh\n\n"); +#endif + + /* handle default tablespace */ + /* TODO */ + + /* handle alternate tablespaces */ + for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + { + /* TODO */ + } + + fclose(fd); + +#ifndef WIN32 + if (chmod(script, S_IRWXU) != 0) + pg_fatal("could not add execute permission to file \"%s\": %m", script); +#endif + + check_ok(); + + /* report location of script to user */ + pg_log(PG_REPORT, "\n" + " To revert to the old cluster, run this script before\n" + " starting the new cluster:\n" + " %s", + script); +} + + /* * check_is_install_user() * diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index b8fd0d0acee..23cb08e8347 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -52,9 +52,11 @@ generate_old_dump(void) snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); parallel_exec_prog(log_file_name, NULL, - "\"%s/pg_dump\" %s --no-data %s --sequence-data --quote-all-identifiers " + "\"%s/pg_dump\" %s --no-data %s %s --quote-all-identifiers " "--binary-upgrade --format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", user_opts.do_statistics ? 
"" : "--no-statistics", log_opts.dumpdir, diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 7fd1991204a..4fe784e8b94 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -434,18 +434,28 @@ check_copy_file_range(void) } void -check_hard_link(void) +check_hard_link(transferMode transfer_mode) { char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; + /* only used for --link and --swap */ + Assert(transfer_mode == TRANSFER_MODE_LINK || + transfer_mode == TRANSFER_MODE_SWAP); + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata); unlink(new_link_file); /* might fail */ if (link(existing_file, new_link_file) < 0) - pg_fatal("could not create hard link between old and new data directories: %m\n" - "In link mode the old and new data directories must be on the same file system."); + { + if (transfer_mode == TRANSFER_MODE_LINK) + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In link mode the old and new data directories must be on the same file system."); + else + pg_fatal("could not create hard link between old and new data directories: %m\n" + "In swap mode the old and new data directories must be on the same file system."); + } unlink(new_link_file); } diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index ad52de8b607..4b7a56f5b3b 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -490,7 +490,7 @@ get_rel_infos_query(void) " FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n " " ON c.relnamespace = n.oid " " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", " - CppAsString2(RELKIND_MATVIEW) ") AND " + CppAsString2(RELKIND_MATVIEW) "%s) AND " /* exclude possible orphaned temp tables */ " ((n.nspname !~ '^pg_temp_' AND " " n.nspname !~ '^pg_toast_temp_' AND " @@ -499,6 +499,8 @@ get_rel_infos_query(void) " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " " relname IN ('pg_largeobject') ))), ", + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? 
+ ", " CppAsString2(RELKIND_SEQUENCE) : "", FirstNormalObjectId); /* diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 188dd8d8a8b..7fd7f1d33fc 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -62,6 +62,7 @@ parseCommandLine(int argc, char *argv[]) {"sync-method", required_argument, NULL, 4}, {"no-statistics", no_argument, NULL, 5}, {"set-char-signedness", required_argument, NULL, 6}, + {"swap", no_argument, NULL, 7}, {NULL, 0, NULL, 0} }; @@ -228,6 +229,11 @@ parseCommandLine(int argc, char *argv[]) else pg_fatal("invalid argument for option %s", "--set-char-signedness"); break; + + case 7: + user_opts.transfer_mode = TRANSFER_MODE_SWAP; + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); @@ -325,6 +331,7 @@ usage(void) printf(_(" --no-statistics do not import statistics from old cluster\n")); printf(_(" --set-char-signedness=OPTION set new cluster char signedness to \"signed\" or\n" " \"unsigned\"\n")); + printf(_(" --swap move data directories to new cluster\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\n" diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 174cd920840..a538d407f74 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -212,8 +212,10 @@ main(int argc, char **argv) { prep_status("Sync data directory to disk"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/initdb\" --sync-only \"%s\" --sync-method %s", + "\"%s/initdb\" --sync-only %s \"%s\" --sync-method %s", new_cluster.bindir, + (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? + "--no-sync-data-files" : "", new_cluster.pgdata, user_opts.sync_method); check_ok(); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index f4e375d27c7..9403c0ac78f 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -262,6 +262,7 @@ typedef enum TRANSFER_MODE_COPY, TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK, + TRANSFER_MODE_SWAP, } transferMode; /* @@ -385,6 +386,7 @@ void output_completion_banner(char *deletion_script_file_name); void check_cluster_versions(void); void check_cluster_compatibility(void); void create_script_for_old_cluster_deletion(char **deletion_script_file_name); +void create_script_for_swap_revert(void); /* controldata.c */ @@ -423,7 +425,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); void check_file_clone(void); void check_copy_file_range(void); -void check_hard_link(void); +void check_hard_link(transferMode transfer_mode); /* fopen_priv() is no longer different from fopen() */ #define fopen_priv(path, mode) fopen(path, mode) diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 8c23c583172..059ef98350f 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -11,11 +11,91 @@ #include <sys/stat.h> +#include "common/file_utils.h" +#include "common/int.h" +#include "common/logging.h" #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +/* + * The following set of sync_queue_* functions are used for --swap to reduce + * the amount of time spent synchronizing the swapped catalog files. 
When a + * file is added to the queue, we also alert the file system that we'd like it + * to be persisted to disk in the near future (if that operation is supported + * by the current platform). Once the queue is full, all of the files are + * synchronized to disk. This strategy should generally be much faster than + * simply calling fsync() on the files right away. + * + * The general usage pattern should be something like: + * + * for (int i = 0; i < num_files; i++) + * sync_queue_push(files[i]); + * + * // be sure to sync any remaining files in the queue + * sync_queue_sync_all(); + * synq_queue_destroy(); + */ + +#define SYNC_QUEUE_MAX_LEN (1024) + +static char *sync_queue[SYNC_QUEUE_MAX_LEN]; +static bool sync_queue_inited; +static int sync_queue_len; + +static inline void +sync_queue_init(void) +{ + if (sync_queue_inited) + return; + + sync_queue_inited = true; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + sync_queue[i] = palloc(MAXPGPATH); +} + +static inline void +sync_queue_sync_all(void) +{ + if (!sync_queue_inited) + return; + + for (int i = 0; i < sync_queue_len; i++) + { + if (fsync_fname(sync_queue[i], false) != 0) + pg_fatal("could not synchronize file \"%s\": %m", sync_queue[i]); + } + + sync_queue_len = 0; +} + +static inline void +sync_queue_push(const char *fname) +{ + sync_queue_init(); + + pre_sync_fname(fname, false); + + strncpy(sync_queue[sync_queue_len++], fname, MAXPGPATH); + if (sync_queue_len >= SYNC_QUEUE_MAX_LEN) + sync_queue_sync_all(); +} + +static inline void +sync_queue_destroy(void) +{ + if (!sync_queue_inited) + return; + + sync_queue_inited = false; + sync_queue_len = 0; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + { + pfree(sync_queue[i]); + sync_queue[i] = NULL; + } +} /* * transfer_all_new_tablespaces() @@ -41,6 +121,17 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_SWAP: + + /* + * We generate the revert script for this mode before starting + * file transfer so that it can be used in the case of a crash + * halfway through. + */ + create_script_for_swap_revert(); + + prep_status_progress("Swapping data directories"); + break; } /* @@ -125,6 +216,271 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, /* We allocate something even for n_maps == 0 */ pg_free(mappings); } + + /* + * Make sure anything pending synchronization in swap mode is fully + * persisted to disk. This is a no-op for other transfer modes. + */ + sync_queue_sync_all(); + sync_queue_destroy(); +} + +/* + * prepare_for_swap() + * + * This function durably moves the database directories from the old cluster to + * the new cluster in preparation for moving the pg_restore-generated catalog + * files into place. Returns false if the database with the given OID does not + * have a directory in the given tablespace, otherwise returns true. 
+ */ +static bool +prepare_for_swap(const char *old_tablespace, Oid db_oid, + char *old_cat, char *new_dat, char *moved_dat) +{ + const char *new_tablespace; + const char *old_tblspc_suffix; + const char *new_tblspc_suffix; + char old_tblspc[MAXPGPATH]; + char new_tblspc[MAXPGPATH]; + char moved_tblspc[MAXPGPATH]; + char old_dat[MAXPGPATH]; + struct stat st; + + if (strcmp(old_tablespace, old_cluster.pgdata) == 0) + { + new_tablespace = new_cluster.pgdata; + new_tblspc_suffix = "/base"; + old_tblspc_suffix = "/base"; + } + else + { + new_tablespace = old_tablespace; + new_tblspc_suffix = new_cluster.tablespace_suffix; + old_tblspc_suffix = old_cluster.tablespace_suffix; + } + + snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix); + snprintf(moved_tblspc, sizeof(moved_tblspc), "%s_moved", old_tblspc); + snprintf(old_cat, MAXPGPATH, "%s/%u_old_cat", moved_tblspc, db_oid); + snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", new_tablespace, new_tblspc_suffix); + snprintf(new_dat, MAXPGPATH, "%s/%u", new_tblspc, db_oid); + snprintf(moved_dat, MAXPGPATH, "%s/%u", moved_tblspc, db_oid); + snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, db_oid); + + /* Check that the database directory exists in the given tablespace. */ + if (stat(old_dat, &st) != 0) + { + if (errno != ENOENT) + pg_fatal("could not stat file \"%s\": %m", old_dat); + return false; + } + + /* Create directory for stuff that is moved aside. */ + if (durable_mkdir_p(moved_tblspc) != 0) + pg_fatal("could not create directory \"%s\"", moved_tblspc); + + /* Create directory for old catalog files. */ + if (durable_mkdir_p(old_cat) != 0) + pg_fatal("could not create directory \"%s\"", old_cat); + + /* Move the new cluster's database directory aside. */ + if (durable_rename_dir(new_dat, moved_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", new_dat, moved_dat); + + /* Move the old cluster's database directory into place. */ + if (durable_rename_dir(old_dat, new_dat) != 0) + pg_fatal("could not rename \"%s\" to \"%s\"", old_dat, new_dat); + + return true; +} + +/* + * FileNameMapCmp() + * + * qsort() comparator for FileNameMap that sorts by RelFileNumber. + */ +static int +FileNameMapCmp(const void *a, const void *b) +{ + const FileNameMap *map1 = (const FileNameMap *) a; + const FileNameMap *map2 = (const FileNameMap *) b; + + return pg_cmp_u32(map1->relfilenumber, map2->relfilenumber); +} + +/* + * parse_relfilenumber() + * + * Attempt to parse the RelFileNumber of the given file name. If we can't, + * return InvalidRelFileNumber. + */ +static RelFileNumber +parse_relfilenumber(const char *filename) +{ + char *endp; + unsigned long n; + + if (filename[0] < '1' || filename[0] > '9') + return InvalidRelFileNumber; + + errno = 0; + n = strtoul(filename, &endp, 10); + if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX) + return InvalidRelFileNumber; + + return (RelFileNumber) n; +} + +/* + * swap_catalog_files() + * + * Moves the old catalog files aside, and moves the new catalog files into + * place. + */ +static void +swap_catalog_files(FileNameMap *maps, int size, const char *old_cat, + const char *new_dat, const char *moved_dat) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char dest[MAXPGPATH]; + RelFileNumber rfn; + + /* + * Move the old catalog files aside. 
+ */ + dir = opendir(new_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", new_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", new_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key; + + key.relfilenumber = (RelFileNumber) rfn; + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", old_cat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + } + + if (errno) + pg_fatal("could not read directory \"%s\": %m", new_dat); + (void) closedir(dir); + + /* + * Move the new catalog files into place. + */ + dir = opendir(moved_dat); + if (dir == NULL) + pg_fatal("could not open directory \"%s\": %m", moved_dat); + while (errno = 0, (de = readdir(dir)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "%s/%s", moved_dat, de->d_name); + if (get_dirent_type(path, de, false, PG_LOG_ERROR) != PGFILETYPE_REG) + continue; + + rfn = parse_relfilenumber(de->d_name); + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key; + + key.relfilenumber = (RelFileNumber) rfn; + if (bsearch(&key, maps, size, sizeof(FileNameMap), FileNameMapCmp)) + continue; + } + + snprintf(dest, sizeof(dest), "%s/%s", new_dat, de->d_name); + if (rename(path, dest) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + + /* + * We don't fsync() the database files in the file synchronization + * stage of pg_upgrade in swap mode, so we need to synchronize them + * ourselves. We only do this for the catalog files because they were + * created during pg_restore with fsync=off. We assume that the user + * data files files were properly persisted to disk when the user last + * shut it down. + */ + sync_queue_push(dest); + } + + if (errno) + pg_fatal("could not read directory \"%s\": %m", moved_dat); + (void) closedir(dir); + + /* + * Ensure the directory entries are persisted to disk. + */ + if (fsync_fname(old_cat, true) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", old_cat); + if (fsync_fname(new_dat, true) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", new_dat); + if (fsync_fname(moved_dat, true) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", moved_dat); +} + +/* + * do_swap() + * + * Perform the required steps for --swap for a single database. In short this + * moves the old cluster's database directory into the new cluster and then + * replaces any files for system catalogs with the ones that were generated + * during pg_restore. + */ +static void +do_swap(FileNameMap *maps, int size, char *old_tablespace) +{ + char old_cat[MAXPGPATH]; + char new_dat[MAXPGPATH]; + char moved_dat[MAXPGPATH]; + + /* + * We perform many lookups on maps by relfilenumber in swap mode, so make + * sure it's sorted. + */ + qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp); + + /* + * If an old tablespace is given, we only need to process that one. If no + * old tablespace is specified, we need to process all the tablespaces on + * the system. 
+ */ + if (old_tablespace) + { + if (prepare_for_swap(old_tablespace, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + else + { + if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + + for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + { + if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid, + old_cat, new_dat, moved_dat)) + swap_catalog_files(maps, size, old_cat, new_dat, moved_dat); + } + } } /* @@ -145,6 +501,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* --swap has its own subroutine */ + if (user_opts.transfer_mode == TRANSFER_MODE_SWAP) + { + /* + * We don't support --swap to upgrade from versions that require + * rewriting the visibility map. We should've failed already if + * someone tries to do that. + */ + Assert(!vm_must_add_frozenbit); + + do_swap(maps, size, old_tablespace); + return; + } + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || @@ -259,6 +629,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + case TRANSFER_MODE_SWAP: + /* swap mode is handled in its own code path */ + pg_fatal("should never happen"); + break; } } } -- 2.39.5 (Apple Git-154)
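For anyone who wants to kick the tires, an invocation might look like this (bindir/datadir paths are placeholders). Per the documentation added above, fsync is the recommended sync method with --swap, since syncfs can be slowed down by the garbage files left behind in the old cluster:

    # Upgrade by moving the old cluster's data directories into the new
    # cluster; the two data directories must be on the same file system.
    pg_upgrade --swap --sync-method=fsync \
        -b /path/to/old/bin -B /path/to/new/bin \
        -d /path/to/old/data -D /path/to/new/data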