I've attached the latest patches after further review from Justin Pryzby. -- David Gilman :DG< https://gilslotd.com
>From 90e06cbb724f6f6a244dfc69f3d59ca2e7d29c01 Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Wed, 20 May 2020 22:49:28 -0400 Subject: [PATCH 1/4] Scan all TOCs when restoring a custom dump file without offsets
TOC requests are not guaranteed to come in disk order. If the custom dump file was written with data offsets, pg_restore can seek directly to the data, making request order irrelevant. If there are no data offsets, pg_restore would never attempt to seek backwards, even when seeking is possible, and would be unable to find TOCs before the current read position in the file. 548e50976 changed how pg_restore's parallel algorithm worked at the cost of greatly increasing out-of-order TOC requests. This patch changes pg_restore to scan through all TOCs to service a TOC read request when restoring a custom dump file without data offsets. The odds of getting a successful parallel restore go way up at the cost of a bunch of extra tiny reads when pg_restore starts up. The pg_restore manpage now warns against running pg_dump with an unseekable output file and suggests that if you plan on doing a parallel restore of a custom dump, you should run pg_dump with --file. --- doc/src/sgml/ref/pg_restore.sgml | 8 ++++++++ src/bin/pg_dump/pg_backup_custom.c | 25 ++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index 232f88024f..23286bb076 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -279,6 +279,14 @@ PostgreSQL documentation jobs cannot be used together with the option <option>--single-transaction</option>. </para> + + <para> + <application>pg_restore</application> with concurrent jobs may fail + when restoring a custom archive format dump written to an unseekable + output stream, like stdout. To allow for concurrent restoration of + a custom archive format dump, use <application>pg_dump</application>'s + <option>--file</option> option to specify an output file. + </para> </listitem> </varlistentry> diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 369dcea429..5aa7ab33db 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -415,6 +415,7 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) lclTocEntry *tctx = (lclTocEntry *) te->formatData; int blkType; int id; + bool initialScan = true; if (tctx->dataState == K_OFFSET_NO_DATA) return; @@ -423,13 +424,28 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) { /* * We cannot seek directly to the desired block. Instead, skip over - * block headers until we find the one we want. This could fail if we - * are asked to restore items out-of-order. + * block headers until we find the one we want. */ - _readBlockHeader(AH, &blkType, &id); - while (blkType != EOF && id != te->dumpId) + for (;;) { + _readBlockHeader(AH, &blkType, &id); + + if (blkType == EOF && ctx->hasSeek && initialScan) + { + /* + * This was possibly an out-of-order request. Try one extra + * pass over the file to find the TOC. + */ + initialScan = false; + if (fseeko(AH->FH, ctx->dataStart, SEEK_SET) != 0) + fatal("error during file seek: %m"); + continue; + } + + if (blkType == EOF || id == te->dumpId) + break; + switch (blkType) { case BLK_DATA: @@ -445,7 +461,6 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) blkType); break; } - _readBlockHeader(AH, &blkType, &id); } } else -- 2.26.2
>From 750958499b19e6295a15b01ccb3a9e3ce963af2c Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Sat, 23 May 2020 10:49:33 -0400 Subject: [PATCH 2/4] Add integration test for out-of-order TOC requests in pg_restore Add undocumented --disable-seeking argument to pg_dump to emulate writing to an unseekable output file. --- src/bin/pg_dump/pg_backup.h | 4 +- src/bin/pg_dump/pg_backup_archiver.c | 14 ++-- src/bin/pg_dump/pg_backup_archiver.h | 1 + src/bin/pg_dump/pg_backup_custom.c | 7 +- src/bin/pg_dump/pg_dump.c | 6 +- src/bin/pg_dump/t/002_pg_dump.pl | 97 +++++++++++++++++++++++++++- 6 files changed, 117 insertions(+), 12 deletions(-) diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 8c0cedcd98..160d705eb5 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -158,6 +158,7 @@ typedef struct _dumpOptions int use_setsessauth; int enable_row_security; int load_via_partition_root; + int disable_seeking; /* default, if no "inclusion" switches appear, is to dump everything */ bool include_everything; @@ -270,7 +271,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); /* Create a new archive */ extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker); + SetupWorkerPtrType setupDumpWorker, + bool disableSeeking); /* The --list option */ extern void PrintTOCSummary(Archive *AH); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 8f0b32ca17..0e5d4e0a60 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -69,7 +69,8 @@ typedef struct _parallelReadyList static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr); + SetupWorkerPtrType setupWorkerPtr, + bool disableSeeking); static void _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, bool isData); @@ -233,11 +234,12 @@ setupRestoreWorker(Archive *AHX) Archive * CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker) + SetupWorkerPtrType setupDumpWorker, + bool disableSeeking) { ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, dosync, - mode, setupDumpWorker); + mode, setupDumpWorker, disableSeeking); return (Archive *) AH; } @@ -247,7 +249,7 @@ CreateArchive(const char *FileSpec, const ArchiveFormat fmt, Archive * OpenArchive(const char *FileSpec, const ArchiveFormat fmt) { - ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker); + ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker, false); return (Archive *) AH; } @@ -2274,7 +2276,8 @@ _discoverArchiveFormat(ArchiveHandle *AH) static ArchiveHandle * _allocAH(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr) + SetupWorkerPtrType setupWorkerPtr, + bool disableSeeking) { ArchiveHandle *AH; @@ -2325,6 +2328,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, AH->mode = mode; AH->compression = compression; AH->dosync = dosync; + AH->disableSeeking = disableSeeking; memset(&(AH->sqlparse), 0, sizeof(AH->sqlparse)); diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index f9e6b42752..acf38c20da 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -340,6 +340,7 @@ struct _archiveHandle bool dosync; /* data requested to be synced on sight */ ArchiveMode mode; /* File mode - r or w */ void *formatData; /* Header data specific to file format */ + bool disableSeeking; /* Don't use fseeko() */ /* these vars track state to avoid sending redundant SET commands */ char *currUser; /* current username, or NULL if unknown */ diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 5aa7ab33db..90a026e0f4 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -145,6 +145,7 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) AH->lo_buf = (void *) pg_malloc(LOBBUFSIZE); ctx->filePos = 0; + ctx->hasSeek = 0; /* * Now open the file @@ -164,7 +165,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) fatal("could not open output file: %m"); } - ctx->hasSeek = checkSeek(AH->FH); + if (!AH->disableSeeking) + ctx->hasSeek = checkSeek(AH->FH); } else { @@ -181,7 +183,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) fatal("could not open input file: %m"); } - ctx->hasSeek = checkSeek(AH->FH); + if (!AH->disableSeeking) + ctx->hasSeek = checkSeek(AH->FH); ReadHead(AH); ReadToc(AH); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index a4e949c636..6ed45d319f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -319,6 +319,7 @@ main(int argc, char **argv) int plainText = 0; ArchiveFormat archiveFormat = archUnknown; ArchiveMode archiveMode; + bool disableSeeking = false; static DumpOptions dopt; @@ -360,6 +361,7 @@ main(int argc, char **argv) {"binary-upgrade", no_argument, &dopt.binary_upgrade, 1}, {"column-inserts", no_argument, &dopt.column_inserts, 1}, {"disable-dollar-quoting", no_argument, &dopt.disable_dollar_quoting, 1}, + {"disable-seeking", no_argument, &dopt.disable_seeking, 1}, {"disable-triggers", no_argument, &dopt.disable_triggers, 1}, {"enable-row-security", no_argument, &dopt.enable_row_security, 1}, {"exclude-table-data", required_argument, NULL, 4}, @@ -673,6 +675,8 @@ main(int argc, char **argv) if (archiveFormat == archNull) plainText = 1; + disableSeeking = (bool) dopt.disable_seeking; + /* Custom and directory formats are compressed by default, others not */ if (compressLevel == -1) { @@ -715,7 +719,7 @@ main(int argc, char **argv) /* Open the output file */ fout = CreateArchive(filename, archiveFormat, compressLevel, dosync, - archiveMode, setupDumpWorker); + archiveMode, setupDumpWorker, disableSeeking); /* Make dump options accessible right away */ SetArchiveOptions(fout, &dopt, NULL); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index e116235769..2423ec5b1d 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -31,9 +31,20 @@ my $tempdir_short = TestLib::tempdir_short; # generate a text file to run through the tests from the # non-text file generated by pg_dump. # -# TODO: Have pg_restore actually restore to an independent -# database and then pg_dump *that* database (or something along -# those lines) to validate that part of the process. +# secondary_dump_cmd is an optional key that enables a +# +# pg_dump (dump_cmd) -> +# pg_restore (restore_cmd, real database) -> +# pg_dump (secondary_dmp_cmd, SQL output (for matching)) +# +# test process instead of the default +# +# pg_dump (dump_cmd) -> +# pg_restore (restore_cmd, SQL output (for matching)) +# +# test process. The secondary_dump database is created for +# you and your restore_cmd and secondary_dmp_cmd must read +# and write from it. my %pgdump_runs = ( binary_upgrade => { @@ -139,6 +150,23 @@ my %pgdump_runs = ( "$tempdir/defaults_custom_format.dump", ], }, + defaults_custom_format_no_seek_parallel_restore => { + test_key => 'defaults', + dump_cmd => [ + 'pg_dump', '-Fc', '-Z6', '--no-sync', '--disable-seeking', + "--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.dump", 'postgres', + ], + restore_cmd => [ + 'pg_restore', '-Fc', '--jobs=2', + '--dbname=secondary_dump', + "$tempdir/defaults_custom_format_no_seek_parallel_restore.dump", + ], + secondary_dump_cmd => [ + 'pg_dump', '-Fp', '--no-sync', + "--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.sql", + 'secondary_dump', + ], + }, # Do not use --no-sync to give test coverage for data sync. defaults_dir_format => { @@ -153,6 +181,24 @@ my %pgdump_runs = ( "$tempdir/defaults_dir_format", ], }, + defaults_dir_format_parallel => { + test_key => 'defaults', + dump_cmd => [ + 'pg_dump', '-Fd', + '--jobs=2', '--no-sync', + "--file=$tempdir/defaults_dir_format_parallel", 'postgres', + ], + restore_cmd => [ + 'pg_restore', '-Fd', + '--jobs=2', '--dbname=secondary_dump', + "$tempdir/defaults_dir_format_parallel", + ], + secondary_dump_cmd => [ + 'pg_dump', '-Fp', '--no-sync', + "--file=$tempdir/defaults_dir_format_parallel.sql", + 'secondary_dump', + ], + }, # Do not use --no-sync to give test coverage for data sync. defaults_parallel => { @@ -3298,6 +3344,27 @@ my %tests = ( %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, unlike => { exclude_dump_test_schema => 1 }, + }, + + # This reliably produces the "possibly due to out-of-order restore request" + # when restoring with -j 2 when this was written but future changes to how + # pg_dump works could accidentally correctly order things as we're not really + # telling pg_dump how to do its on-disk layout. + 'custom dump out-of-order' => { + create_sql => ' + CREATE TABLE ooo_parent0 (id integer primary key); + CREATE TABLE ooo_child0 (parent0 int references ooo_parent0 (id)); + CREATE TABLE ooo_parent1 (id integer primary key); + CREATE TABLE ooo_child1 (parent0 int references ooo_parent1 (id));', + regexp => qr/^ + \QCREATE TABLE public.ooo_child0 (\E\n + \s+\Qparent0 integer\E\n + \Q);\E\n/xm, + like => { + %full_runs, section_pre_data => 1, + defaults_custom_format_no_seek => 1, + defaults_custom_format_no_seek_parallel_restore => 1, + }, }); ######################################### @@ -3350,6 +3417,11 @@ foreach my $run (sort keys %pgdump_runs) $num_tests++; } + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $num_tests++; + } + if ($pgdump_runs{$run}->{test_key}) { $test_key = $pgdump_runs{$run}->{test_key}; @@ -3499,12 +3571,23 @@ foreach my $run (sort keys %pgdump_runs) $node->command_ok(\@{ $pgdump_runs{$run}->{dump_cmd} }, "$run: pg_dump runs"); + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->safe_psql('postgres', 'create database secondary_dump;'); + } + if ($pgdump_runs{$run}->{restore_cmd}) { $node->command_ok(\@{ $pgdump_runs{$run}->{restore_cmd} }, "$run: pg_restore runs"); } + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->command_ok(\@{ $pgdump_runs{$run}->{secondary_dump_cmd} }, + "$run: secondary pg_dump runs"); + } + if ($pgdump_runs{$run}->{test_key}) { $test_key = $pgdump_runs{$run}->{test_key}; @@ -3561,6 +3644,14 @@ foreach my $run (sort keys %pgdump_runs) } } } + + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->safe_psql('secondary_dump', + 'alter subscription sub1 set (slot_name = NONE);'); + $node->safe_psql('secondary_dump', 'drop subscription sub1;'); + $node->safe_psql('postgres', 'drop database secondary_dump;'); + } } ######################################### -- 2.26.2
>From 35d05669a0455bb928e51ea532a791e7f097987d Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Fri, 22 May 2020 17:27:51 -0400 Subject: [PATCH 3/4] Remove unused seek check in tar dump format --- src/bin/pg_dump/pg_backup_tar.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d5bfa55646..bb5d3b1f45 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -82,7 +82,6 @@ typedef struct typedef struct { - int hasSeek; pgoff_t filePos; TAR_MEMBER *blobToc; FILE *tarFH; @@ -192,8 +191,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) */ /* setvbuf(ctx->tarFH, NULL, _IONBF, 0); */ - ctx->hasSeek = checkSeek(ctx->tarFH); - /* * We don't support compression because reading the files back is not * possible since gzdopen uses buffered IO which totally screws file @@ -226,8 +223,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) ctx->tarFHpos = 0; - ctx->hasSeek = checkSeek(ctx->tarFH); - /* * Forcibly unmark the header as read since we use the lookahead * buffer -- 2.26.2
>From aabcd1d843f063b7f9de2e44df7c6147226ad389 Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Mon, 25 May 2020 17:34:52 -0400 Subject: [PATCH 4/4] Skip tables in pg_restore by seeking instead of reading --- src/bin/pg_dump/pg_backup_custom.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 90a026e0f4..662e7a1793 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -580,19 +580,27 @@ _skipData(ArchiveHandle *AH) blkLen = ReadInt(AH); while (blkLen != 0) { - if (blkLen > buflen) + if (ctx->hasSeek) { - if (buf) - free(buf); - buf = (char *) pg_malloc(blkLen); - buflen = blkLen; + if (fseeko(AH->FH, blkLen, SEEK_CUR) != 0) + fatal("error during file seek: %m"); } - if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen) + else { - if (feof(AH->FH)) - fatal("could not read from input file: end of file"); - else - fatal("could not read from input file: %m"); + if (blkLen > buflen) + { + if (buf) + free(buf); + buf = (char *) pg_malloc(blkLen); + buflen = blkLen; + } + if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen) + { + if (feof(AH->FH)) + fatal("could not read from input file: end of file"); + else + fatal("could not read from input file: %m"); + } } ctx->filePos += blkLen; -- 2.26.2