Here is a rebased patch set for cfbot. I'm planning to spend some time getting these patches into a more reviewable state in the near future.
-- nathan
>From 81fe66e0f0aa4f958a8707df669f60756c89bb85 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Tue, 5 Nov 2024 15:59:51 -0600 Subject: [PATCH v2 1/8] Export walkdir(). THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. A follow-up commit will use this function to swap catalog files between database directories during pg_upgrade. --- src/common/file_utils.c | 5 +---- src/include/common/file_utils.h | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 398fe1c334..3f488bf5ec 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -48,9 +48,6 @@ #ifdef PG_FLUSH_DATA_WORKS static int pre_sync_fname(const char *fname, bool isdir); #endif -static void walkdir(const char *path, - int (*action) (const char *fname, bool isdir), - bool process_symlinks); #ifdef HAVE_SYNCFS @@ -268,7 +265,7 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * * See also walkdir in fd.c, which is a backend version of this logic. */ -static void +void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks) diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index e4339fb7b6..5a9519acfe 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -39,6 +39,9 @@ extern void sync_pgdata(const char *pg_data, int serverVersion, extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); +extern void walkdir(const char *path, + int (*action) (const char *fname, bool isdir), + bool process_symlinks); #endif extern PGFileType get_dirent_type(const char *path, -- 2.39.5 (Apple Git-154)
>From 36d6a1aad5cbfeb05954886bb336cfa9ec01c5c3 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 6 Nov 2024 13:59:39 -0600 Subject: [PATCH v2 2/8] Add "void *arg" parameter to walkdir() that is passed to function. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. This will be used in follow up commits to pass private state to the functions called by walkdir(). --- src/bin/pg_basebackup/walmethods.c | 8 +++---- src/bin/pg_dump/pg_backup_custom.c | 2 +- src/bin/pg_dump/pg_backup_tar.c | 2 +- src/bin/pg_dump/pg_dumpall.c | 2 +- src/common/file_utils.c | 38 +++++++++++++++--------------- src/include/common/file_utils.h | 6 ++--- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index 215b24597f..51640cb493 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -251,7 +251,7 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname, */ if (wwmethod->sync) { - if (fsync_fname(tmppath, false) != 0 || + if (fsync_fname(tmppath, false, NULL) != 0 || fsync_parent_path(tmppath) != 0) { wwmethod->lasterrno = errno; @@ -486,7 +486,7 @@ dir_close(Walfile *f, WalCloseMethod method) */ if (f->wwmethod->sync) { - r = fsync_fname(df->fullpath, false); + r = fsync_fname(df->fullpath, false, NULL); if (r == 0) r = fsync_parent_path(df->fullpath); } @@ -617,7 +617,7 @@ dir_finish(WalWriteMethod *wwmethod) * Files are fsynced when they are closed, but we need to fsync the * directory entry here as well. */ - if (fsync_fname(dir_data->basedir, true) != 0) + if (fsync_fname(dir_data->basedir, true, NULL) != 0) { wwmethod->lasterrno = errno; return false; @@ -1321,7 +1321,7 @@ tar_finish(WalWriteMethod *wwmethod) if (wwmethod->sync) { - if (fsync_fname(tar_data->tarfilename, false) != 0 || + if (fsync_fname(tar_data->tarfilename, false, NULL) != 0 || fsync_parent_path(tar_data->tarfilename) != 0) { wwmethod->lasterrno = errno; diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index e44b887eb2..51edf147d6 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -767,7 +767,7 @@ _CloseArchive(ArchiveHandle *AH) /* Sync the output file if one is defined */ if (AH->dosync && AH->mode == archModeWrite && AH->fSpec) - (void) fsync_fname(AH->fSpec, false); + (void) fsync_fname(AH->fSpec, false, NULL); AH->FH = NULL; } diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index b5ba3b46dd..5ea6a472d4 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -847,7 +847,7 @@ _CloseArchive(ArchiveHandle *AH) /* Sync the output file if one is defined */ if (AH->dosync && AH->fSpec) - (void) fsync_fname(AH->fSpec, false); + (void) fsync_fname(AH->fSpec, false, NULL); } AH->FH = NULL; diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 9a04e51c81..58a9f6e748 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -621,7 +621,7 @@ main(int argc, char *argv[]) /* sync the resulting file, errors are not fatal */ if (dosync) - (void) fsync_fname(filename, false); + (void) fsync_fname(filename, false, NULL); } exit_nicely(0); diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 3f488bf5ec..dc90f35ae1 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -46,7 +46,7 @@ #define MINIMUM_VERSION_FOR_PG_WAL 100000 #ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir); +static int pre_sync_fname(const char *fname, bool isdir, void *arg); #endif #ifdef HAVE_SYNCFS @@ -184,10 +184,10 @@ sync_pgdata(const char *pg_data, * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false); + walkdir(pg_data, pre_sync_fname, false, NULL); if (xlog_is_symlink) - walkdir(pg_wal, pre_sync_fname, false); - walkdir(pg_tblspc, pre_sync_fname, true); + walkdir(pg_wal, pre_sync_fname, false, NULL); + walkdir(pg_tblspc, pre_sync_fname, true, NULL); #endif /* @@ -200,10 +200,10 @@ sync_pgdata(const char *pg_data, * get fsync'd twice. That's not an expected case so we don't * worry about optimizing it. */ - walkdir(pg_data, fsync_fname, false); + walkdir(pg_data, fsync_fname, false, NULL); if (xlog_is_symlink) - walkdir(pg_wal, fsync_fname, false); - walkdir(pg_tblspc, fsync_fname, true); + walkdir(pg_wal, fsync_fname, false, NULL); + walkdir(pg_tblspc, fsync_fname, true, NULL); } break; } @@ -242,10 +242,10 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(dir, pre_sync_fname, false); + walkdir(dir, pre_sync_fname, false, NULL); #endif - walkdir(dir, fsync_fname, false); + walkdir(dir, fsync_fname, false, NULL); } break; } @@ -267,8 +267,8 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) */ void walkdir(const char *path, - int (*action) (const char *fname, bool isdir), - bool process_symlinks) + int (*action) (const char *fname, bool isdir, void *arg), + bool process_symlinks, void *arg) { DIR *dir; struct dirent *de; @@ -293,10 +293,10 @@ walkdir(const char *path, switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR)) { case PGFILETYPE_REG: - (*action) (subpath, false); + (*action) (subpath, false, arg); break; case PGFILETYPE_DIR: - walkdir(subpath, action, false); + walkdir(subpath, action, false, arg); break; default: @@ -320,7 +320,7 @@ walkdir(const char *path, * synced. Recent versions of ext4 have made the window much wider but * it's been an issue for ext3 and other filesystems in the past. */ - (*action) (path, true); + (*action) (path, true, arg); } /* @@ -332,7 +332,7 @@ walkdir(const char *path, #ifdef PG_FLUSH_DATA_WORKS static int -pre_sync_fname(const char *fname, bool isdir) +pre_sync_fname(const char *fname, bool isdir, void *arg) { int fd; @@ -373,7 +373,7 @@ pre_sync_fname(const char *fname, bool isdir) * are fatal. */ int -fsync_fname(const char *fname, bool isdir) +fsync_fname(const char *fname, bool isdir, void *arg) { int fd; int flags; @@ -444,7 +444,7 @@ fsync_parent_path(const char *fname) if (strlen(parentpath) == 0) strlcpy(parentpath, ".", MAXPGPATH); - if (fsync_fname(parentpath, true) != 0) + if (fsync_fname(parentpath, true, NULL) != 0) return -1; return 0; @@ -467,7 +467,7 @@ durable_rename(const char *oldfile, const char *newfile) * because it's then guaranteed that either source or target file exists * after a crash. */ - if (fsync_fname(oldfile, false) != 0) + if (fsync_fname(oldfile, false, NULL) != 0) return -1; fd = open(newfile, PG_BINARY | O_RDWR, 0); @@ -502,7 +502,7 @@ durable_rename(const char *oldfile, const char *newfile) * To guarantee renaming the file is persistent, fsync the file with its * new name, and its containing directory. */ - if (fsync_fname(newfile, false) != 0) + if (fsync_fname(newfile, false, NULL) != 0) return -1; if (fsync_parent_path(newfile) != 0) diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 5a9519acfe..c328f56a85 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -33,15 +33,15 @@ typedef enum DataDirSyncMethod struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND -extern int fsync_fname(const char *fname, bool isdir); +extern int fsync_fname(const char *fname, bool isdir, void *arg); extern void sync_pgdata(const char *pg_data, int serverVersion, DataDirSyncMethod sync_method); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); extern void walkdir(const char *path, - int (*action) (const char *fname, bool isdir), - bool process_symlinks); + int (*action) (const char *fname, bool isdir, void *arg), + bool process_symlinks, void *arg); #endif extern PGFileType get_dirent_type(const char *path, -- 2.39.5 (Apple Git-154)
>From f5eca13b8b04760977ab41ef9cd023a47e5cbbbd Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Tue, 5 Nov 2024 16:38:19 -0600 Subject: [PATCH v2 3/8] Introduce catalog-swap mode for pg_upgrade. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. This new mode moves the database directories from the old cluster to the new cluster and then swaps the pg_restore-generated catalog files in place. This can significantly increase the length of the following data synchronization step (due to the large number of unsynchronized pg_restore-generated files), but this problem will be handled in follow-up commits. --- src/bin/pg_upgrade/check.c | 2 + src/bin/pg_upgrade/option.c | 5 + src/bin/pg_upgrade/pg_upgrade.h | 1 + src/bin/pg_upgrade/relfilenumber.c | 167 +++++++++++++++++++++++++++++ src/tools/pgindent/typedefs.list | 1 + 5 files changed, 176 insertions(+) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 94164f0472..a4bb365718 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -711,6 +711,8 @@ check_new_cluster(void) case TRANSFER_MODE_LINK: check_hard_link(); break; + case TRANSFER_MODE_CATALOG_SWAP: + break; } check_is_install_user(&new_cluster); diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 6f41d63eed..64091a54c4 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -60,6 +60,7 @@ parseCommandLine(int argc, char *argv[]) {"copy", no_argument, NULL, 2}, {"copy-file-range", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 4}, + {"catalog-swap", no_argument, NULL, 5}, {NULL, 0, NULL, 0} }; @@ -212,6 +213,10 @@ parseCommandLine(int argc, char *argv[]) user_opts.sync_method = pg_strdup(optarg); break; + case 5: + user_opts.transfer_mode = TRANSFER_MODE_CATALOG_SWAP; + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), os_info.progname); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 53f693c2d4..19cb5a011e 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -256,6 +256,7 @@ typedef enum TRANSFER_MODE_COPY, TRANSFER_MODE_COPY_FILE_RANGE, TRANSFER_MODE_LINK, + TRANSFER_MODE_CATALOG_SWAP, } transferMode; /* diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 07baa49a02..9d8fce3c4a 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -11,11 +11,21 @@ #include <sys/stat.h> +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/int.h" +#include "fe_utils/option_utils.h" #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +typedef struct move_catalog_file_context +{ + FileNameMap *maps; + int size; + char *target; +} move_catalog_file_context; /* * transfer_all_new_tablespaces() @@ -41,6 +51,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_CATALOG_SWAP: + prep_status_progress("Swapping catalog files"); + break; } /* @@ -127,6 +140,144 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, } } +static int +FileNameMapCmp(const void *a, const void *b) +{ + return pg_cmp_u32(((const FileNameMap *) a)->relfilenumber, + ((const FileNameMap *) b)->relfilenumber); +} + +static RelFileNumber +parse_relfilenumber(const char *filename) +{ + char *endp; + unsigned long n; + + if (filename[0] < '1' || filename[0] > '9') + return InvalidRelFileNumber; + + errno = 0; + n = strtoul(filename, &endp, 10); + if (errno || filename == endp || n <= 0 || n > PG_UINT32_MAX) + return InvalidRelFileNumber; + + return (RelFileNumber) n; +} + +static int +move_catalog_file(const char *fname, bool isdir, void *arg) +{ + char dst[MAXPGPATH]; + const char *filename = last_dir_separator(fname) + 1; + RelFileNumber rfn = parse_relfilenumber(filename); + move_catalog_file_context *context = (move_catalog_file_context *) arg; + + /* + * XXX: Is this right? AFAICT we don't really expect there to be + * directories within database directories, so perhaps it would be better + * to either unconditionally rename or to fail. Further investigation is + * required. + */ + if (isdir) + return 0; + + if (RelFileNumberIsValid(rfn)) + { + FileNameMap key; + + key.relfilenumber = (RelFileNumber) rfn; + if (bsearch(&key, context->maps, context->size, + sizeof(FileNameMap), FileNameMapCmp)) + return 0; + } + + snprintf(dst, sizeof(dst), "%s/%s", context->target, filename); + if (rename(fname, dst) != 0) + pg_fatal("could not rename \"%s\" to \"%s\": %m", fname, dst); + + return 0; +} + +/* + * XXX: This proof-of-concept patch doesn't yet handle non-default tablespaces. + */ +static void +do_catalog_transfer(FileNameMap *maps, int size, char *old_tablespace) +{ + char old_tblspc[MAXPGPATH]; + char new_tblspc[MAXPGPATH]; + char old_dat[MAXPGPATH]; + char new_dat[MAXPGPATH]; + char moved_tblspc[MAXPGPATH]; + char moved_dat[MAXPGPATH]; + char old_cat[MAXPGPATH]; + move_catalog_file_context context; + DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; + + parse_sync_method(user_opts.sync_method, &sync_method); + + snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", + maps[0].old_tablespace, maps[0].old_tablespace_suffix); + snprintf(new_tblspc, sizeof(new_tblspc), "%s%s", + maps[0].new_tablespace, maps[0].new_tablespace_suffix); + snprintf(old_dat, sizeof(old_dat), "%s/%u", old_tblspc, maps[0].db_oid); + snprintf(new_dat, sizeof(new_dat), "%s/%u", new_tblspc, maps[0].db_oid); + snprintf(moved_tblspc, sizeof(moved_tblspc), "%s_moved", old_tblspc); + snprintf(moved_dat, sizeof(moved_dat), "%s/%u", + moved_tblspc, maps[0].db_oid); + snprintf(old_cat, sizeof(old_cat), "%s/%u_old_cat", + moved_tblspc, maps[0].db_oid); + + qsort(maps, size, sizeof(FileNameMap), FileNameMapCmp); + + /* create dir for stuff that is moved aside */ + if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) && errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", moved_tblspc); + + /* move new cluster data dir aside */ + if (rename(new_dat, moved_dat)) + pg_fatal("could not rename \"%s\" to \"%s\": %m", new_dat, moved_dat); + + /* move old cluster data dir in place */ + if (rename(old_dat, new_dat)) + pg_fatal("could not rename \"%s\" to \"%s\": %m", old_dat, new_dat); + + /* create dir for old catalogs */ + if (pg_mkdir_p(old_cat, pg_dir_create_mode)) + pg_fatal("could not create directory \"%s\": %m", old_cat); + + /* move catalogs in new data dir aside */ + context.maps = maps; + context.size = size; + context.target = old_cat; + walkdir(new_dat, move_catalog_file, false, &context); + + /* move catalogs in moved-aside data dir in place */ + context.target = new_dat; + walkdir(moved_dat, move_catalog_file, false, &context); + + /* no need to sync things individually if we are going to syncfs() later */ + if (sync_method == DATA_DIR_SYNC_METHOD_SYNCFS) + return; + + /* fsync directory entries */ + if (fsync_fname(moved_dat, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", moved_dat); + if (fsync_fname(old_cat, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", old_cat); + + /* + * XXX: We could instead fsync() these directories once at the end instead + * of once per-database, but it doesn't affect performance meaningfully, + * and this is just a proof-of-concept patch, so I haven't bothered doing + * the required refactoring yet. + */ + if (fsync_fname(old_tblspc, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", old_tblspc); + if (fsync_fname(moved_tblspc, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", moved_tblspc); +} + /* * transfer_single_new_db() * @@ -145,6 +296,18 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* + * XXX: In catalog-swap mode, vm_must_add_frozenbit isn't handled yet. We + * could either disallow using catalog-swap mode if the upgrade involves + * versions older than v9.6, or we could add code to handle rewriting the + * visibility maps in this mode (like the other modes do). + */ + if (user_opts.transfer_mode == TRANSFER_MODE_CATALOG_SWAP) + { + do_catalog_transfer(maps, size, old_tablespace); + return; + } + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || @@ -259,6 +422,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_CATALOG_SWAP: + pg_fatal("should never happen"); + break; } } } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 2d4c870423..f721f934c0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3655,6 +3655,7 @@ mix_data_t mixedStruct mode_t movedb_failure_params +move_catalog_file_context multirange_bsearch_comparison multirange_unnest_fctx mxact -- 2.39.5 (Apple Git-154)
>From 44d566652775b2a88789e181d95a15b92ac913c6 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Tue, 5 Nov 2024 16:47:42 -0600 Subject: [PATCH v2 4/8] Add --no-sync-data-files flag to initdb. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. This new mode caused 'initdb --sync-only' to synchronize everything except for the database directories. It will be used in a follow-up commit that aims to reduce the duration of the data synchronization step in pg_upgrade's catalog-swap mode. --- src/bin/initdb/initdb.c | 9 ++++-- src/bin/pg_basebackup/pg_basebackup.c | 2 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_combinebackup/pg_combinebackup.c | 2 +- src/bin/pg_rewind/file_ops.c | 2 +- src/common/file_utils.c | 35 ++++++++++++++++----- src/include/common/file_utils.h | 3 +- 7 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 9a91830783..53c6e86a80 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -168,6 +168,7 @@ static bool data_checksums = true; static char *xlog_dir = NULL; static int wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024); static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static bool sync_data_files = true; /* internal vars */ @@ -3183,6 +3184,7 @@ main(int argc, char *argv[]) {"icu-rules", required_argument, NULL, 18}, {"sync-method", required_argument, NULL, 19}, {"no-data-checksums", no_argument, NULL, 20}, + {"no-sync-data-files", no_argument, NULL, 21}, {NULL, 0, NULL, 0} }; @@ -3377,6 +3379,9 @@ main(int argc, char *argv[]) case 20: data_checksums = false; break; + case 21: + sync_data_files = false; + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -3428,7 +3433,7 @@ main(int argc, char *argv[]) fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); return 0; } @@ -3491,7 +3496,7 @@ main(int argc, char *argv[]) { fputs(_("syncing data to disk ... "), stdout); fflush(stdout); - sync_pgdata(pg_data, PG_VERSION_NUM, sync_method); + sync_pgdata(pg_data, PG_VERSION_NUM, sync_method, sync_data_files); check_ok(); } else diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index e41a6cfbda..43526e3246 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2310,7 +2310,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, } else { - (void) sync_pgdata(basedir, serverVersion, sync_method); + (void) sync_pgdata(basedir, serverVersion, sync_method, true); } } diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index b86bc417c9..06ccaacfda 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -633,7 +633,7 @@ main(int argc, char *argv[]) if (do_sync) { pg_log_info("syncing data directory"); - sync_pgdata(DataDir, PG_VERSION_NUM, sync_method); + sync_pgdata(DataDir, PG_VERSION_NUM, sync_method, true); } pg_log_info("updating control file"); diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 5f1f62f1db..80a137be4e 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -420,7 +420,7 @@ main(int argc, char *argv[]) else { pg_log_debug("recursively fsyncing \"%s\"", opt.output); - sync_pgdata(opt.output, version * 10000, opt.sync_method); + sync_pgdata(opt.output, version * 10000, opt.sync_method, true); } } diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 67a86bb4c5..ceb1c3ac6d 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -296,7 +296,7 @@ sync_target_dir(void) if (!do_sync || dry_run) return; - sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method); + sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method, true); } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index dc90f35ae1..65cdf07ae7 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -94,7 +94,8 @@ do_syncfs(const char *path) void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method) + DataDirSyncMethod sync_method, + bool sync_data_files) { bool xlog_is_symlink; char pg_wal[MAXPGPATH]; @@ -184,10 +185,11 @@ sync_pgdata(const char *pg_data, * fsync the data directory and its contents. */ #ifdef PG_FLUSH_DATA_WORKS - walkdir(pg_data, pre_sync_fname, false, NULL); + walkdir(pg_data, pre_sync_fname, false, &sync_data_files); if (xlog_is_symlink) - walkdir(pg_wal, pre_sync_fname, false, NULL); - walkdir(pg_tblspc, pre_sync_fname, true, NULL); + walkdir(pg_wal, pre_sync_fname, false, &sync_data_files); + if (sync_data_files) + walkdir(pg_tblspc, pre_sync_fname, true, NULL); #endif /* @@ -200,10 +202,11 @@ sync_pgdata(const char *pg_data, * get fsync'd twice. That's not an expected case so we don't * worry about optimizing it. */ - walkdir(pg_data, fsync_fname, false, NULL); + walkdir(pg_data, fsync_fname, false, &sync_data_files); if (xlog_is_symlink) - walkdir(pg_wal, fsync_fname, false, NULL); - walkdir(pg_tblspc, fsync_fname, true, NULL); + walkdir(pg_wal, fsync_fname, false, &sync_data_files); + if (sync_data_files) + walkdir(pg_tblspc, fsync_fname, true, NULL); } break; } @@ -296,7 +299,23 @@ walkdir(const char *path, (*action) (subpath, false, arg); break; case PGFILETYPE_DIR: - walkdir(subpath, action, false, arg); + + /* + * XXX: Checking here for the "sync_data_files" case is quite + * hacky, but it's not clear how to do better. Another option + * would be to send "de" down to the function, but that would + * introduce a huge number of function pointer calls and + * directory reads that we are trying to avoid. + */ +#ifdef PG_FLUSH_DATA_WORKS + if ((action != pre_sync_fname && action != fsync_fname) || +#else + if (action != fsync_fname || +#endif + !arg || *((bool *) arg) || + strcmp(de->d_name, "base") != 0) + walkdir(subpath, action, false, arg); + break; default: diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index c328f56a85..3743caa63e 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -35,7 +35,8 @@ struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir, void *arg); extern void sync_pgdata(const char *pg_data, int serverVersion, - DataDirSyncMethod sync_method); + DataDirSyncMethod sync_method, + bool sync_data_files); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); -- 2.39.5 (Apple Git-154)
>From c911cee97239542e14a489ed84ab3b46a047659e Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 6 Nov 2024 09:52:19 -0600 Subject: [PATCH v2 5/8] Export pre_sync_fname(). THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. A follow-up commit will use this function to alert the file system that we want a file's data on disk so that subsequent calls to fsync() are faster. --- src/common/file_utils.c | 18 +++++------------- src/include/common/file_utils.h | 1 + 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 65cdf07ae7..5c201ec6e8 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -45,10 +45,6 @@ */ #define MINIMUM_VERSION_FOR_PG_WAL 100000 -#ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir, void *arg); -#endif - #ifdef HAVE_SYNCFS /* @@ -307,11 +303,7 @@ walkdir(const char *path, * introduce a huge number of function pointer calls and * directory reads that we are trying to avoid. */ -#ifdef PG_FLUSH_DATA_WORKS if ((action != pre_sync_fname && action != fsync_fname) || -#else - if (action != fsync_fname || -#endif !arg || *((bool *) arg) || strcmp(de->d_name, "base") != 0) walkdir(subpath, action, false, arg); @@ -348,11 +340,12 @@ walkdir(const char *path, * Ignores errors trying to open unreadable files, and reports other errors * non-fatally. */ -#ifdef PG_FLUSH_DATA_WORKS - -static int +int pre_sync_fname(const char *fname, bool isdir, void *arg) { +#ifndef PG_FLUSH_DATA_WORKS + return 0; +#else int fd; fd = open(fname, O_RDONLY | PG_BINARY, 0); @@ -380,9 +373,8 @@ pre_sync_fname(const char *fname, bool isdir, void *arg) (void) close(fd); return 0; -} - #endif /* PG_FLUSH_DATA_WORKS */ +} /* * fsync_fname -- Try to fsync a file or directory diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 3743caa63e..e7a34d4c4e 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -43,6 +43,7 @@ extern int fsync_parent_path(const char *fname); extern void walkdir(const char *path, int (*action) (const char *fname, bool isdir, void *arg), bool process_symlinks, void *arg); +extern int pre_sync_fname(const char *fname, bool isdir, void *arg); #endif extern PGFileType get_dirent_type(const char *path, -- 2.39.5 (Apple Git-154)
>From 6565fea925c2bb51a03428fa0a40728588220eed Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 6 Nov 2024 10:40:43 -0600 Subject: [PATCH v2 6/8] In pg_upgrade's catalog-swap mode, only sync files as necessary. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. In this mode, it can be much faster to use "--sync-method fsync", which now skips synchronizing data files moved from the old cluster (which we assumed were synchronized before pg_upgrade). --- src/bin/pg_upgrade/pg_upgrade.c | 6 ++-- src/bin/pg_upgrade/relfilenumber.c | 52 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 663235816f..f5946ac89a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -210,10 +210,12 @@ main(int argc, char **argv) { prep_status("Sync data directory to disk"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/initdb\" --sync-only \"%s\" --sync-method %s", + "\"%s/initdb\" --sync-only \"%s\" --sync-method %s %s", new_cluster.bindir, new_cluster.pgdata, - user_opts.sync_method); + user_opts.sync_method, + (user_opts.transfer_mode == TRANSFER_MODE_CATALOG_SWAP) ? + "--no-sync-data-files" : ""); check_ok(); } diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 9d8fce3c4a..dcca4bb2e7 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -25,8 +25,49 @@ typedef struct move_catalog_file_context FileNameMap *maps; int size; char *target; + bool sync_moved; } move_catalog_file_context; +#define SYNC_QUEUE_MAX_LEN (1024) + +static char *sync_queue[SYNC_QUEUE_MAX_LEN]; +static bool sync_queue_inited; +static int sync_queue_len; + +static inline void +sync_queue_init(void) +{ + if (sync_queue_inited) + return; + + sync_queue_inited = true; + for (int i = 0; i < SYNC_QUEUE_MAX_LEN; i++) + sync_queue[i] = palloc(MAXPGPATH); +} + +static inline void +sync_queue_sync_all(void) +{ + if (!sync_queue_inited) + return; + + for (int i = 0; i < sync_queue_len; i++) + fsync_fname(sync_queue[i], false, NULL); + sync_queue_len = 0; +} + +static inline void +sync_queue_push(const char *fname) +{ + sync_queue_init(); + + pre_sync_fname(fname, false, NULL); + + strncpy(sync_queue[sync_queue_len++], fname, MAXPGPATH); + if (sync_queue_len >= SYNC_QUEUE_MAX_LEN) + sync_queue_sync_all(); +} + /* * transfer_all_new_tablespaces() * @@ -138,6 +179,8 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, /* We allocate something even for n_maps == 0 */ pg_free(mappings); } + + sync_queue_sync_all(); } static int @@ -195,6 +238,9 @@ move_catalog_file(const char *fname, bool isdir, void *arg) if (rename(fname, dst) != 0) pg_fatal("could not rename \"%s\" to \"%s\": %m", fname, dst); + if (context->sync_moved) + sync_queue_push(dst); + return 0; } @@ -250,10 +296,12 @@ do_catalog_transfer(FileNameMap *maps, int size, char *old_tablespace) context.maps = maps; context.size = size; context.target = old_cat; + context.sync_moved = false; walkdir(new_dat, move_catalog_file, false, &context); /* move catalogs in moved-aside data dir in place */ context.target = new_dat; + context.sync_moved = (sync_method != DATA_DIR_SYNC_METHOD_SYNCFS); walkdir(moved_dat, move_catalog_file, false, &context); /* no need to sync things individually if we are going to syncfs() later */ @@ -265,6 +313,8 @@ do_catalog_transfer(FileNameMap *maps, int size, char *old_tablespace) pg_fatal("could not synchronize directory \"%s\": %m", moved_dat); if (fsync_fname(old_cat, true, NULL) != 0) pg_fatal("could not synchronize directory \"%s\": %m", old_cat); + if (fsync_fname(new_dat, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", new_dat); /* * XXX: We could instead fsync() these directories once at the end instead @@ -276,6 +326,8 @@ do_catalog_transfer(FileNameMap *maps, int size, char *old_tablespace) pg_fatal("could not synchronize directory \"%s\": %m", old_tblspc); if (fsync_fname(moved_tblspc, true, NULL) != 0) pg_fatal("could not synchronize directory \"%s\": %m", moved_tblspc); + if (fsync_fname(new_tblspc, true, NULL) != 0) + pg_fatal("could not synchronize directory \"%s\": %m", new_tblspc); } /* -- 2.39.5 (Apple Git-154)
>From d60d05908793a1850c6e34979924d1502449bbfc Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 6 Nov 2024 10:46:11 -0600 Subject: [PATCH v2 7/8] Add --sequence-data flag to pg_dump. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. This flag can be used to optionally dump the sequence data even when --schema-only is used. It is primarily intended for use in a follow-up commit that will cause sequence data files to be carried over from the old cluster in pg_upgrade's new catalog-swap mode. --- src/bin/pg_dump/pg_dump.c | 9 +-------- src/bin/pg_upgrade/dump.c | 2 +- src/test/modules/test_pg_dump/t/001_base.pl | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index add7f16c90..a0810aaefd 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -507,6 +507,7 @@ main(int argc, char **argv) {"sync-method", required_argument, NULL, 15}, {"filter", required_argument, NULL, 16}, {"exclude-extension", required_argument, NULL, 17}, + {"sequence-data", no_argument, &dopt.sequence_data, 1}, {NULL, 0, NULL, 0} }; @@ -774,14 +775,6 @@ main(int argc, char **argv) if (dopt.column_inserts && dopt.dump_inserts == 0) dopt.dump_inserts = DUMP_DEFAULT_ROWS_PER_INSERT; - /* - * Binary upgrade mode implies dumping sequence data even in schema-only - * mode. This is not exposed as a separate option, but kept separate - * internally for clarity. - */ - if (dopt.binary_upgrade) - dopt.sequence_data = 1; - if (data_only && schema_only) pg_fatal("options -s/--schema-only and -a/--data-only cannot be used together"); diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 8345f55be8..8453722833 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -53,7 +53,7 @@ generate_old_dump(void) parallel_exec_prog(log_file_name, NULL, "\"%s/pg_dump\" %s --schema-only --quote-all-identifiers " - "--binary-upgrade --format=custom %s --no-sync --file=\"%s/%s\" %s", + "--binary-upgrade --sequence-data --format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), log_opts.verbose ? "--verbose" : "", log_opts.dumpdir, diff --git a/src/test/modules/test_pg_dump/t/001_base.pl b/src/test/modules/test_pg_dump/t/001_base.pl index e2579e29cd..46231c93f1 100644 --- a/src/test/modules/test_pg_dump/t/001_base.pl +++ b/src/test/modules/test_pg_dump/t/001_base.pl @@ -48,7 +48,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', "--file=$tempdir/binary_upgrade.sql", '--schema-only', - '--binary-upgrade', '--dbname=postgres', + '--binary-upgrade', '--sequence-data', '--dbname=postgres', ], }, clean => { -- 2.39.5 (Apple Git-154)
>From 0d7af5d0333b71d3f583077d3c2bd6cdb06fbf79 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 6 Nov 2024 10:53:40 -0600 Subject: [PATCH v2 8/8] Avoid copying sequence files in pg_upgrade's catalog-swap mode. THIS IS A PROOF OF CONCEPT AND IS NOT READY FOR SERIOUS REVIEW. On clusters with many sequences, this can further reduce the amount of time required to wire up the data files in the new cluster. If the sequence data file format changes, this optimization cannot be used, but that seems rare enough. --- src/bin/pg_upgrade/dump.c | 8 +++++++- src/bin/pg_upgrade/info.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 8453722833..d5a81cc29c 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -51,10 +51,16 @@ generate_old_dump(void) snprintf(sql_file_name, sizeof(sql_file_name), DB_DUMP_FILE_MASK, old_db->db_oid); snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid); + /* + * XXX: We need to be sure that the sequence data format hasn't + * changed. + */ parallel_exec_prog(log_file_name, NULL, "\"%s/pg_dump\" %s --schema-only --quote-all-identifiers " - "--binary-upgrade --sequence-data --format=custom %s --no-sync --file=\"%s/%s\" %s", + "--binary-upgrade %s --format=custom %s --no-sync --file=\"%s/%s\" %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), + (user_opts.transfer_mode == TRANSFER_MODE_CATALOG_SWAP) ? + "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", log_opts.dumpdir, sql_file_name, escaped_connstr.data); diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index f83ded89cb..786d17e32f 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -483,6 +483,8 @@ get_rel_infos_query(void) * pg_largeobject contains user data that does not appear in pg_dump * output, so we have to copy that system table. It's easiest to do that * by treating it as a user table. + * + * XXX: We need to be sure that the sequence data format hasn't changed. */ appendPQExpBuffer(&query, "WITH regular_heap (reloid, indtable, toastheap) AS ( " @@ -490,7 +492,7 @@ get_rel_infos_query(void) " FROM pg_catalog.pg_class c JOIN pg_catalog.pg_namespace n " " ON c.relnamespace = n.oid " " WHERE relkind IN (" CppAsString2(RELKIND_RELATION) ", " - CppAsString2(RELKIND_MATVIEW) ") AND " + CppAsString2(RELKIND_MATVIEW) "%s) AND " /* exclude possible orphaned temp tables */ " ((n.nspname !~ '^pg_temp_' AND " " n.nspname !~ '^pg_toast_temp_' AND " @@ -499,6 +501,8 @@ get_rel_infos_query(void) " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " " relname IN ('pg_largeobject') ))), ", + (user_opts.transfer_mode == TRANSFER_MODE_CATALOG_SWAP) ? + ", " CppAsString2(RELKIND_SEQUENCE) : "", FirstNormalObjectId); /* -- 2.39.5 (Apple Git-154)