On Thu, Jan 24, 2019 at 5:19 AM Amit Kapila <amit.kapil...@gmail.com> wrote: > > 1. > + if ((maps[mapnum].relkind != RELKIND_RELATION && > + maps[mapnum].relkind != RELKIND_TOASTVALUE) || > + first_seg_size > HEAP_FSM_CREATION_THRESHOLD * BLCKSZ || > + GET_MAJOR_VERSION(new_cluster.major_version) <= 1100) > + (void) transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); > > I think this check will needlessly be performed for future versions as > well, say when wants to upgrade from PG12 to PG13. That might not > create any problem, but let's try to be more precise. Can you try to > rewrite this check? You might want to encapsulate it inside a > function. I have thought of doing something similar to what we do for > vm, see checks relate to VISIBILITY_MAP_FROZEN_BIT_CAT_VER, but I > guess for this patch it is not important to check catalog version as > even if someone tries to upgrade to the same version.
Agreed, done for v19 (I've only attached the pg_upgrade patch). > 2. > transfer_relfile() > { > .. > - /* Is it an extent, fsm, or vm file? */ > - if (type_suffix[0] != '\0' || segno != 0) > + /* Did file open fail? */ > + if (stat(old_file, &statbuf) != 0) > .. > } > > So from now onwards, we will call stat for even 0th segment which > means there is one additional system call for each relation, not sure > if that matters, but I think there is no harm in once testing with a > large number of relations say 10K to 50K relations which have FSM. Performance testing is probably a good idea anyway, but I went ahead and implemented your next idea: > The other alternative is we can fetch pg_class.relpages and rely on > that to take this decision, but again if that is not updated, we might > take the wrong decision. We can think of it this way: Which is worse, 1. Transferring a FSM we don't need, or 2. Skipping a FSM we need I'd say #2 is worse. So, in v19 we check pg_class.relpages and if it's a heap and less than or equal the threshold we call stat on the 0th segment to verify. In the common case, the cost of the stat call is offset by not linking the FSM. Despite needing another pg_class field, I think this code is actually easier to read than my earlier versions. > 3. > -static void > +static Size > transfer_relfile(FileNameMap *map, const char *type_suffix, bool > vm_must_add_frozenbit) > > If we decide to go with the approach proposed by you, we should add > some comments atop this function for return value change? Done, as well as other comment edits. -- John Naylor https://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From 277d969c31349b22e2c7726935d681e72aacaece Mon Sep 17 00:00:00 2001 From: John Naylor <jcnay...@gmail.com> Date: Fri, 25 Jan 2019 17:18:28 -0500 Subject: [PATCH v19 3/3] During pg_upgrade, conditionally skip transfer of FSMs. If a heap on the old cluster has 4 pages or fewer, don't copy or link the FSM. This will reduce space usage for installations with large numbers of small tables. --- src/bin/pg_upgrade/info.c | 18 ++++++- src/bin/pg_upgrade/pg_upgrade.h | 6 +++ src/bin/pg_upgrade/relfilenode.c | 84 +++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 15 deletions(-) diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 2f925f086c..55d4911d10 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -200,6 +200,8 @@ create_rel_filename_map(const char *old_data, const char *new_data, map->old_db_oid = old_db->db_oid; map->new_db_oid = new_db->db_oid; + map->relpages = old_rel->relpages; + map->relkind = old_rel->relkind; /* * old_relfilenode might differ from pg_class.oid (and hence @@ -415,9 +417,11 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) int ntups; int relnum; int num_rels = 0; + int relpages; char *nspname = NULL; char *relname = NULL; char *tablespace = NULL; + char *relkind = NULL; int i_spclocation, i_nspname, i_relname, @@ -425,7 +429,9 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_indtable, i_toastheap, i_relfilenode, - i_reltablespace; + i_reltablespace, + i_relpages, + i_relkind; char query[QUERY_ALLOC]; char *last_namespace = NULL, *last_tablespace = NULL; @@ -494,7 +500,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) */ snprintf(query + strlen(query), sizeof(query) - strlen(query), "SELECT all_rels.*, n.nspname, c.relname, " - " c.relfilenode, c.reltablespace, %s " + " c.relfilenode, c.reltablespace, c.relpages, c.relkind, %s " "FROM (SELECT * FROM regular_heap " " UNION ALL " " SELECT * FROM toast_heap " @@ -526,6 +532,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) i_relfilenode = PQfnumber(res, "relfilenode"); i_reltablespace = PQfnumber(res, "reltablespace"); i_spclocation = PQfnumber(res, "spclocation"); + i_relpages = PQfnumber(res, "relpages"); + i_relkind = PQfnumber(res, "relkind"); for (relnum = 0; relnum < ntups; relnum++) { @@ -555,6 +563,12 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) relname = PQgetvalue(res, relnum, i_relname); curr->relname = pg_strdup(relname); + relpages = atoi(PQgetvalue(res, relnum, i_relpages)); + curr->relpages = relpages; + + relkind = PQgetvalue(res, relnum, i_relkind); + curr->relkind = relkind[0]; + curr->relfilenode = atooid(PQgetvalue(res, relnum, i_relfilenode)); curr->tblsp_alloc = false; diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 2f67eee22b..baeb8ff0f8 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -147,6 +147,8 @@ typedef struct char *tablespace; /* tablespace path; "" for cluster default */ bool nsp_alloc; /* should nspname be freed? */ bool tblsp_alloc; /* should tablespace be freed? */ + int32 relpages; /* # of pages -- see pg_class.h */ + char relkind; /* relation kind -- see pg_class.h */ } RelInfo; typedef struct @@ -173,6 +175,10 @@ typedef struct */ Oid old_relfilenode; Oid new_relfilenode; + + int32 relpages; /* # of pages -- see pg_class.h */ + char relkind; /* relation kind -- see pg_class.h */ + /* the rest are used only for logging and error reporting */ char *nspname; /* namespaces */ char *relname; diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index 0c78073f0e..ffcec23d7c 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -14,10 +14,12 @@ #include <sys/stat.h> #include "catalog/pg_class_d.h" #include "access/transam.h" +#include "storage/freespace.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit); +static Size transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit); +static bool new_cluster_needs_fsm(char relkind, Size first_seg_size); /* @@ -144,6 +146,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) int mapnum; bool vm_crashsafe_match = true; bool vm_must_add_frozenbit = false; + Size first_seg_size = 0; /* * Do the old and new cluster disagree on the crash-safetiness of the vm @@ -165,18 +168,22 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { - /* transfer primary file */ - transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); + /* Transfer main fork and return size of the first segment. */ + first_seg_size = transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) { /* - * Copy/link any fsm and vm files, if they exist + * Transfer any FSM files if they would be created in the + * new cluster. */ - transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); + if (new_cluster_needs_fsm(maps[mapnum].relkind, first_seg_size)) + (void) transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); + + /* Transfer any VM files if we can trust their contents. */ if (vm_crashsafe_match) - transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); + (void) transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); } } } @@ -188,9 +195,10 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) * * Copy or link file from old cluster to new one. If vm_must_add_frozenbit * is true, visibility map forks are converted and rewritten, even in link - * mode. + * mode. Returns size of the first segment. We only care about the accuracy + * of the size for small heap relations. */ -static void +static Size transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) { char old_file[MAXPGPATH]; @@ -198,6 +206,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro int segno; char extent_suffix[65]; struct stat statbuf; + Size first_seg_size = 0; /* * Now copy/link any related segments as well. Remember, PG breaks large @@ -234,16 +243,40 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro { /* File does not exist? That's OK, just return */ if (errno == ENOENT) - return; + return first_seg_size; else - pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s\n", - map->nspname, map->relname, old_file, new_file, - strerror(errno)); + goto fatal; } /* If file is empty, just return */ if (statbuf.st_size == 0) - return; + return first_seg_size; + } + + /* Save size of the first segment of the main fork. */ + + else if (map->relpages <= HEAP_FSM_CREATION_THRESHOLD && + (map->relkind == RELKIND_RELATION || + map->relkind == RELKIND_TOASTVALUE)) + { + /* + * In this case, if pg_class.relpages is wrong, it's possible + * that a FSM will be skipped when we actually need it. To guard + * against this, we verify the size of the first segment. + */ + if (stat(old_file, &statbuf) != 0) + goto fatal; + else + first_seg_size = statbuf.st_size; + } + else + { + /* + * For indexes etc., we don't care if pg_class.relpages is wrong, + * since we always transfer their FSMs. For heaps, we might + * transfer a FSM when we don't need to, but this is harmless. + */ + first_seg_size = Min(map->relpages, RELSEG_SIZE) * BLCKSZ; } unlink(new_file); @@ -277,4 +310,29 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro linkFile(old_file, new_file, map->nspname, map->relname); } } + +fatal: + pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + map->nspname, map->relname, old_file, new_file, + strerror(errno)); +} + +/* + * Return false for small heaps if we're upgrading across version 12, + * the first where small heap relations don't have FSMs by default. + */ +static bool +new_cluster_needs_fsm(char relkind, Size first_seg_size) +{ + if (relkind != RELKIND_RELATION && relkind != RELKIND_TOASTVALUE) + return true; + + if (first_seg_size > HEAP_FSM_CREATION_THRESHOLD * BLCKSZ) + return true; + + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1100 && + GET_MAJOR_VERSION(new_cluster.major_version) >= 1200) + return false; + else + return true; } -- 2.17.1