On Mon, May 05, 2025 at 02:23:25PM -0500, Nathan Bossart wrote:
> That leaves pg_shdepend.  For now, I've just instructed pg_upgrade to COPY
> the relevant pg_shdepend rows as an independent step, but perhaps there's a
> reasonably straightforward way to put that in pg_dump, too.

It turns out there is.  TableDataInfo's filtercond field can be used to
easily add a WHERE clause to the data dumping command.  On my laptop,
upgrading with --jobs=8 and 10M large objects evenly distributed across 10
databases (each with a non-bootstrap-superuser owner and another role with
select rights) takes ~100 seconds without this patch and ~30 seconds with
it.
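
For reference, with filtercond set this way, the pg_shdepend data ends up
being dumped with something along these lines (a rough sketch of the
generated COPY command, not verbatim pg_dump output):

    COPY (SELECT dbid, classid, objid, objsubid, refclassid, refobjid, deptype
          FROM pg_catalog.pg_shdepend
          WHERE classid = 'pg_largeobject'::regclass
            AND dbid = (SELECT oid FROM pg_database
                        WHERE datname = current_database())) TO stdout;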

I've also added dependency tracking, version checks (this only works for
upgrades from >=v12 for now), a hack to ensure the columns for
pg_largeobject_metadata/pg_shdepend are collected, and comments.  I'm sure
there's something I've missed, but this patch has worked well in my tests
thus far.
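
For anyone wanting to reproduce the numbers above, something like the
following should generate a comparable per-database workload (a hypothetical
sketch; lo_owner and lo_reader stand in for the non-bootstrap-superuser owner
and the role with select rights):

    SET ROLE lo_owner;
    DO $$
    DECLARE
        loid oid;
    BEGIN
        FOR i IN 1..1000000 LOOP
            loid := lo_create(0);    -- server-assigned OID, owned by lo_owner
            EXECUTE format('GRANT SELECT ON LARGE OBJECT %s TO lo_reader', loid);
        END LOOP;
    END
    $$;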

Taking a step back, I'm a little disappointed in the gains here.  A 3-9x
speedup is nice, but I guess I was hoping to find another order of
magnitude somewhere.  To do any better, I think we'd need to copy the files
for pg_largeobject_metadata directly for upgrades from >= v16, but that
would have to fit somewhere between when pg_restore creates the database
and when it restores any large object comments/seclabels.  I'm not wild
about the amount of hackery required to get that working.

-- 
nathan
From b14b74a4ad3ffff60c2fcb3c241766a651d681c5 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 7 May 2025 09:06:05 -0500
Subject: [PATCH v2 1/1] pg_upgrade: Use COPY for large object metadata.

Reported-by: Hannu Krosing <han...@google.com>
Suggested-by: Tom Lane <t...@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAMT0RQSS-6qLH%2BzYsOeUbAYhop3wmQTkNmQpo5--QRDUR%2BqYmQ%40mail.gmail.com
---
 src/bin/pg_dump/pg_backup_archiver.c | 13 +++++
 src/bin/pg_dump/pg_dump.c            | 78 ++++++++++++++++++++++++++--
 src/bin/pg_dump/pg_dump_sort.c       |  2 +-
 src/bin/pg_dump/t/002_pg_dump.pl     |  4 +-
 4 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index afa42337b11..4c387b0cb5e 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -31,6 +31,8 @@
 #endif
 
 #include "catalog/pg_class_d.h"
+#include "catalog/pg_largeobject_metadata.h"
+#include "catalog/pg_shdepend.h"
 #include "common/string.h"
 #include "compress_io.h"
 #include "dumputils.h"
@@ -2974,6 +2976,17 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH)
        int                     res = REQ_SCHEMA | REQ_DATA;
        RestoreOptions *ropt = AH->public.ropt;
 
+       /*
+        * For binary upgrade mode, dump pg_largeobject_metadata and the
+        * associated pg_shdepend rows. This is faster to restore than the
+        * equivalent set of large object commands.
+        */
+       if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 &&
+               strcmp(te->desc, "TABLE DATA") == 0 &&
+               (te->catalogId.oid == LargeObjectMetadataRelationId ||
+                te->catalogId.oid == SharedDependRelationId))
+               return REQ_DATA;
+
        /* These items are treated specially */
        if (strcmp(te->desc, "ENCODING") == 0 ||
                strcmp(te->desc, "STDSTRINGS") == 0 ||
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index e2e7975b34e..1666a4ec40f 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -49,8 +49,10 @@
 #include "catalog/pg_class_d.h"
 #include "catalog/pg_default_acl_d.h"
 #include "catalog/pg_largeobject_d.h"
+#include "catalog/pg_largeobject_metadata_d.h"
 #include "catalog/pg_proc_d.h"
 #include "catalog/pg_publication_d.h"
+#include "catalog/pg_shdepend.h"
 #include "catalog/pg_subscription_d.h"
 #include "catalog/pg_type_d.h"
 #include "common/connect.h"
@@ -209,6 +211,12 @@ static int nbinaryUpgradeClassOids = 0;
 static SequenceItem *sequences = NULL;
 static int     nsequences = 0;
 
+/*
+ * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use
+ * as a dependency for pg_shdepend and any large object comments/seclabels.
+ */
+static DumpId lo_metadata_dumpId;
+
 /* Maximum number of relations to fetch in a fetchAttributeStats() call. */
 #define MAX_ATTR_STATS_RELS 64
 
@@ -1083,6 +1091,34 @@ main(int argc, char **argv)
        if (!dopt.dumpData && dopt.sequence_data)
                getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE);
 
+       /*
+        * For binary upgrade mode, dump pg_largeobject_metadata and the
+        * associated pg_shdepend rows. This is faster to restore than the
+        * equivalent set of large object commands.
+        */
+       if (dopt.binary_upgrade && fout->remoteVersion >= 120000)
+       {
+               TableInfo  *lo_metadata = findTableByOid(LargeObjectMetadataRelationId);
+               TableInfo  *shdepend = findTableByOid(SharedDependRelationId);
+
+               makeTableDataInfo(&dopt, lo_metadata);
+               makeTableDataInfo(&dopt, shdepend);
+
+               /*
+                * Save pg_largeobject_metadata's dump ID for use as a dependency on
+                * pg_shdepend and any large object comments/seclabels.
+                */
+               lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId;
+               addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId);
+
+               /*
+                * Only dump large object shdepend rows for this database.
+                */
+               shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass "
+                       "AND dbid = (SELECT oid FROM pg_database "
+                       "            WHERE datname = current_database())";
+       }
+
        /*
         * In binary-upgrade mode, we do not have to worry about the actual LO
         * data or the associated metadata that resides in the pg_largeobject and
@@ -3922,10 +3958,29 @@ getLOs(Archive *fout)
                 * as it will be copied by pg_upgrade, which simply copies the
                 * pg_largeobject table. We *do* however dump out anything but the
                 * data, as pg_upgrade copies just pg_largeobject, but not
-                * pg_largeobject_metadata, after the dump is restored.
+                * pg_largeobject_metadata, after the dump is restored.  In versions
+                * before v12, this is done via proper large object commands.  In
+                * newer versions, we dump the content of pg_largeobject_metadata and
+                * any associated pg_shdepend rows, which is faster to restore.
                 */
                if (dopt->binary_upgrade)
-                       loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+               {
+                       if (fout->remoteVersion >= 120000)
+                       {
+                               loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION);
+
+                               /*
+                                * Mark the large object as dependent on
+                                * pg_largeobject_metadata so that any large object
+                                * comments/seclabels are dumped after it.
+                                */
+                               loinfo->dobj.dependencies = (DumpId *) pg_malloc(sizeof(DumpId));
+                               loinfo->dobj.dependencies[0] = lo_metadata_dumpId;
+                               loinfo->dobj.nDeps = loinfo->dobj.allocDeps = 1;
+                       }
+                       else
+                               loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+               }
 
                /*
                 * Create a "BLOBS" data item for the group, too. This is just a
@@ -9034,8 +9089,18 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                if (tbinfo->relkind == RELKIND_SEQUENCE)
                        continue;
 
-               /* Don't bother with uninteresting tables, either */
-               if (!tbinfo->interesting)
+               /*
+                * Don't bother with uninteresting tables, either.  For binary
+                * upgrades, this is bypassed for pg_largeobject_metadata and
+                * pg_shdepend so that the column names are collected for the
+                * corresponding COPY commands.  Restoring the data for those catalogs
+                * is faster than restoring the equivalent set of large object
+                * commands.
+                */
+               if (!tbinfo->interesting &&
+                       !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+                         (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+                          tbinfo->dobj.catId.oid == SharedDependRelationId)))
                        continue;
 
                /* OK, we need info for this table */
@@ -9232,7 +9297,10 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
                        pg_fatal("unrecognized table OID %u", attrelid);
                /* cross-check that we only got requested tables */
                if (tbinfo->relkind == RELKIND_SEQUENCE ||
-                       !tbinfo->interesting)
+                       (!tbinfo->interesting &&
+                        !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+                          (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+                               tbinfo->dobj.catId.oid == SharedDependRelationId))))
                        pg_fatal("unexpected column data for table \"%s\"",
                                         tbinfo->dobj.name);
 
diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c
index 0b0977788f1..538e7dcb493 100644
--- a/src/bin/pg_dump/pg_dump_sort.c
+++ b/src/bin/pg_dump/pg_dump_sort.c
@@ -76,10 +76,10 @@ enum dbObjectTypePriorities
        PRIO_TABLE_ATTACH,
        PRIO_DUMMY_TYPE,
        PRIO_ATTRDEF,
-       PRIO_LARGE_OBJECT,
        PRIO_PRE_DATA_BOUNDARY,         /* boundary! */
        PRIO_TABLE_DATA,
        PRIO_SEQUENCE_SET,
+       PRIO_LARGE_OBJECT,
        PRIO_LARGE_OBJECT_DATA,
        PRIO_STATISTICS_DATA_DATA,
        PRIO_POST_DATA_BOUNDARY,        /* boundary! */
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 55d892d9c16..2417f537ac1 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -1029,6 +1029,7 @@ my %tests = (
                        test_schema_plus_large_objects => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        no_large_objects => 1,
                        no_owner => 1,
                        schema_only => 1,
@@ -1517,6 +1518,7 @@ my %tests = (
                        test_schema_plus_large_objects => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        schema_only => 1,
                        schema_only_with_statistics => 1,
                        no_large_objects => 1,
@@ -4524,9 +4526,9 @@ my %tests = (
                        no_schema => 1,
                        section_data => 1,
                        test_schema_plus_large_objects => 1,
-                       binary_upgrade => 1,
                },
                unlike => {
+                       binary_upgrade => 1,
                        no_large_objects => 1,
                        no_privs => 1,
                        schema_only => 1,
-- 
2.39.5 (Apple Git-154)
