On Tue, Apr 08, 2025 at 01:07:09PM -0400, Tom Lane wrote: > Nathan Bossart <nathandboss...@gmail.com> writes: >> I do think it's worth considering going back to copying >> pg_largeobject_metadata's files for upgrades from v16 and newer. > > (If we do this) I don't see why we'd need to stop at v16. I'm > envisioning that we'd use COPY, which will be dealing in the > text representation of aclitems, and I don't think that's changed > in a long time. The sort of thing that would break it is changes > in the set of available/default privilege bits for large objects. > > That is, where the dump currently contains something like > > SELECT pg_catalog.lo_create('2121'); > ALTER LARGE OBJECT 2121 OWNER TO postgres; > GRANT ALL ON LARGE OBJECT 2121 TO joe; > > we'd have > > COPY pg_largeobject_metadata FROM STDIN; > ... > 2121 10 {postgres=rw/postgres,joe=rw/postgres} > ... > > and some appropriate COPY data for pg_shdepend too.
Attached is a proof-of-concept grade patch for using COPY for pg_largeobject_metadata and the relevant pg_shdepend entries. On my laptop, pg_upgrade with 10M LOs (each with a non-bootstrap-superuser owner and another role with SELECT rights) goes from ~8.5 minutes to ~1 minute with this patch. I originally set out to invent a new lo_create_with_owner() function and teach pg_dump to batch those together in large groups, but as I started the required pg_dump surgery, I was quickly scared away by the complexity. Next, I gave COPY a try. The improvements from using COPY will likely be limited to the pg_upgrade case, but that's the only case I regularly hear complaints about for zillions of large objects, so maybe it's good enough for now. For the COPY approach, I modified pg_dump to dump the contents of pg_largeobject_metadata. This is easy enough, but I ran into problems with the dependent comments and security labels. It turns out that even before v12, we run all the lo_create() commands just so that creating the comments and security labels works (AFAICT). So I suspect upgrading with many large objects has always been slow. The comment/security label dependency issue can be fixed (at least well enough for the tests) by moving PRIO_LARGE_OBJECT below PRIO_TABLE_DATA. There might be an existing issue here, because dbObjectTypePriorities has the following comment: * NOTE: object-type priorities must match the section assignments made in * pg_dump.c; that is, PRE_DATA objects must sort before DO_PRE_DATA_BOUNDARY, * POST_DATA objects must sort after DO_POST_DATA_BOUNDARY, and DATA objects * must sort between them. But dumpLO() puts large objects in SECTION_DATA, and PRIO_LARGE_OBJECT is before PRIO_PRE_DATA_BOUNDARY. I admittedly haven't spent too much time investigating this, though. In any case, it might be a good idea to also make sure we explicitly mark the large objects and their comments/seclabels as dependent on the pg_largeobject_metadata data. 
That leaves pg_shdepend. For now, I've just instructed pg_upgrade to COPY the relevant pg_shdepend rows as an independent step, but perhaps there's a reasonably straightforward way to put that in pg_dump, too. -- nathan
>From 59fa6558a9258719840e4d423c86313937156f95 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Mon, 5 May 2025 14:11:56 -0500 Subject: [PATCH v1 1/1] pg_upgrade: Use COPY for large object metadata. --- src/bin/pg_dump/pg_backup_archiver.c | 5 +++++ src/bin/pg_dump/pg_dump.c | 16 +++++++++++++++- src/bin/pg_dump/pg_dump_sort.c | 2 +- src/bin/pg_dump/t/002_pg_dump.pl | 4 +++- src/bin/pg_upgrade/dump.c | 9 +++++++++ src/bin/pg_upgrade/pg_upgrade.c | 9 +++++++++ 6 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index afa42337b11..0d61155ec0a 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -2974,6 +2974,11 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) int res = REQ_SCHEMA | REQ_DATA; RestoreOptions *ropt = AH->public.ropt; + if (ropt->binary_upgrade && + strcmp(te->tag, "pg_largeobject_metadata") == 0 && + strcmp(te->namespace, "pg_catalog") == 0) + return REQ_DATA; + /* These items are treated specially */ if (strcmp(te->desc, "ENCODING") == 0 || strcmp(te->desc, "STDSTRINGS") == 0 || diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index e2e7975b34e..312e1010456 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1083,6 +1083,20 @@ main(int argc, char **argv) if (!dopt.dumpData && dopt.sequence_data) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); + if (dopt.binary_upgrade) + { + for (int i = 0; i < numTables; i++) + { + if (tblinfo[i].relkind == RELKIND_RELATION && + strcmp(tblinfo[i].dobj.name, "pg_largeobject_metadata") == 0 && + strcmp(tblinfo[i].dobj.namespace->dobj.name, "pg_catalog") == 0) + { + makeTableDataInfo(&dopt, &(tblinfo[i])); + break; + } + } + } + /* * In binary-upgrade mode, we do not have to worry about the actual LO * data or the associated metadata that resides in the pg_largeobject and @@ -3925,7 
+3939,7 @@ getLOs(Archive *fout) * pg_largeobject_metadata, after the dump is restored. */ if (dopt->binary_upgrade) - loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA; + loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION); /* * Create a "BLOBS" data item for the group, too. This is just a diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 0b0977788f1..538e7dcb493 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -76,10 +76,10 @@ enum dbObjectTypePriorities PRIO_TABLE_ATTACH, PRIO_DUMMY_TYPE, PRIO_ATTRDEF, - PRIO_LARGE_OBJECT, PRIO_PRE_DATA_BOUNDARY, /* boundary! */ PRIO_TABLE_DATA, PRIO_SEQUENCE_SET, + PRIO_LARGE_OBJECT, PRIO_LARGE_OBJECT_DATA, PRIO_STATISTICS_DATA_DATA, PRIO_POST_DATA_BOUNDARY, /* boundary! */ diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 55d892d9c16..2417f537ac1 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -1029,6 +1029,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, no_large_objects => 1, no_owner => 1, schema_only => 1, @@ -1517,6 +1518,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, schema_only => 1, schema_only_with_statistics => 1, no_large_objects => 1, @@ -4524,9 +4526,9 @@ my %tests = ( no_schema => 1, section_data => 1, test_schema_plus_large_objects => 1, - binary_upgrade => 1, }, unlike => { + binary_upgrade => 1, no_large_objects => 1, no_privs => 1, schema_only => 1, diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 23cb08e8347..e5b73b7e7a0 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -15,6 +15,9 @@ void generate_old_dump(void) { + char *path = psprintf("%s/lo_shdep.out", realpath(log_opts.dumpdir, NULL)); + PQExpBuffer buf = createPQExpBuffer(); + PGconn *conn; int dbnum; prep_status("Creating dump of global 
objects"); @@ -70,5 +73,11 @@ generate_old_dump(void) ; end_progress_output(); + + conn = connectToServer(&old_cluster, "template1"); + appendStringLiteralConn(buf, path, conn); + PQclear(executeQueryOrDie(conn, "COPY (SELECT * FROM pg_shdepend WHERE classid = 'pg_largeobject'::regclass) TO %s", buf->data)); + destroyPQExpBuffer(buf); + PQfinish(conn); check_ok(); } diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 536e49d2616..7a6234d48c0 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -571,6 +571,8 @@ prepare_new_globals(void) static void create_new_objects(void) { + char *path = psprintf("%s/lo_shdep.out", realpath(log_opts.dumpdir, NULL)); + PQExpBuffer buf = createPQExpBuffer(); int dbnum; PGconn *conn_new_template1; @@ -688,6 +690,13 @@ create_new_objects(void) ; end_progress_output(); + + conn_new_template1 = connectToServer(&new_cluster, "template1"); + appendStringLiteralConn(buf, path, conn_new_template1); + PQclear(executeQueryOrDie(conn_new_template1, "COPY pg_shdepend FROM %s", buf->data)); + destroyPQExpBuffer(buf); + PQfinish(conn_new_template1); + check_ok(); /* -- 2.39.5 (Apple Git-154)