The earlier patches weren't applying because I had "git config diff.noprefix true" set globally and that was messing up the git format-patch output.
On Mon, May 25, 2020 at 01:54:29PM -0500, David Gilman wrote: > And I misunderstood how bad it was. I thought it was reading little > header structs off the disk but it's actually reading the entire table > (see _skipData). So you're quadratically rereading entire tables and > thrashing your cache. Oops. I changed _skipData to fseeko() instead of fread() when possible to cut down on this thrashing further. -- David Gilman :DG< https://gilslotd.com
>From 5ffc0420bc6040cc26b1a36ac522b696ef428464 Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Fri, 22 May 2020 17:27:51 -0400 Subject: [PATCH 1/4] Remove unused seek check in tar dump format --- src/bin/pg_dump/pg_backup_tar.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d5bfa55646..bb5d3b1f45 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -82,7 +82,6 @@ typedef struct typedef struct { - int hasSeek; pgoff_t filePos; TAR_MEMBER *blobToc; FILE *tarFH; @@ -192,8 +191,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) */ /* setvbuf(ctx->tarFH, NULL, _IONBF, 0); */ - ctx->hasSeek = checkSeek(ctx->tarFH); - /* * We don't support compression because reading the files back is not * possible since gzdopen uses buffered IO which totally screws file @@ -226,8 +223,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) ctx->tarFHpos = 0; - ctx->hasSeek = checkSeek(ctx->tarFH); - /* * Forcibly unmark the header as read since we use the lookahead * buffer -- 2.26.2
>From 8687d3947cab7ffadc0c7bdc61e19c4b8a94da2d Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Mon, 25 May 2020 17:34:52 -0400 Subject: [PATCH 2/4] Skip tables in pg_restore by seeking instead of reading --- src/bin/pg_dump/pg_backup_custom.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 369dcea429..8c84e3c611 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -562,19 +562,27 @@ _skipData(ArchiveHandle *AH) blkLen = ReadInt(AH); while (blkLen != 0) { - if (blkLen > buflen) + if (ctx->hasSeek) { - if (buf) - free(buf); - buf = (char *) pg_malloc(blkLen); - buflen = blkLen; + if (fseeko(AH->FH, blkLen, SEEK_CUR) != 0) + fatal("error during file seek: %m"); } - if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen) + else { - if (feof(AH->FH)) - fatal("could not read from input file: end of file"); - else - fatal("could not read from input file: %m"); + if (blkLen > buflen) + { + if (buf) + free(buf); + buf = (char *) pg_malloc(blkLen); + buflen = blkLen; + } + if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen) + { + if (feof(AH->FH)) + fatal("could not read from input file: end of file"); + else + fatal("could not read from input file: %m"); + } } ctx->filePos += blkLen; -- 2.26.2
>From ed106edbfd2eeeadec74289afb8d8bfe8f37fff1 Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Wed, 20 May 2020 22:49:28 -0400 Subject: [PATCH 3/4] Scan all TOCs when restoring a custom dump file without offsets TOC requests are not guaranteed to come in disk order. If the custom dump file was written with data offsets, pg_restore can seek directly to the data, making request order irrelevant. If there are no data offsets pg_restore would never attempt to seek backwards, even when it was possible, and would not find TOCs before the current read position in the file. 548e50976 changed how pg_restore's parallel algorithm worked at the cost of greatly increasing out-of-order TOC requests. This patch changes pg_restore to scan through all TOCs to service a TOC read request when restoring a custom dump file without data offsets. The odds of getting a successful parallel restore go way up at the cost of a bunch of extra tiny reads when pg_restore starts up. The pg_restore manpage now warns against running pg_dump with an unseekable output file and suggests that if you plan on doing a parallel restore of a custom dump you should run pg_dump with --file. --- doc/src/sgml/ref/pg_restore.sgml | 8 ++++++++ src/bin/pg_dump/pg_backup_custom.c | 24 ++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index 232f88024f..a0bf64767b 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -279,6 +279,14 @@ PostgreSQL documentation jobs cannot be used together with the option <option>--single-transaction</option>. </para> + + <para> + <application>pg_restore</application> with concurrent jobs may fail + when restoring a custom archive format dump written to an unseekable + output stream, like stdout. For the best performance when restoring + a custom archive format dump use <application>pg_dump</application>'s + <option>--file</option> option to specify an output file. + </para> </listitem> </varlistentry> diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 8c84e3c611..851add4915 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -415,6 +415,7 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) lclTocEntry *tctx = (lclTocEntry *) te->formatData; int blkType; int id; + bool initialScan = true; if (tctx->dataState == K_OFFSET_NO_DATA) return; @@ -423,11 +424,19 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) { /* * We cannot seek directly to the desired block. Instead, skip over - * block headers until we find the one we want. This could fail if we - * are asked to restore items out-of-order. + * block headers until we find the one we want. */ + _readBlockHeader(AH, &blkType, &id); + if (blkType == EOF && ctx->hasSeek) { + /* Started at the end of the file */ + initialScan = false; + if (fseeko(AH->FH, ctx->dataStart, SEEK_SET) != 0) + fatal("error during file seek: %m"); + _readBlockHeader(AH, &blkType, &id); + } + while (blkType != EOF && id != te->dumpId) { switch (blkType) @@ -446,6 +455,17 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te) break; } _readBlockHeader(AH, &blkType, &id); + + if (blkType == EOF && ctx->hasSeek && initialScan) { + /* + * This was possibly an out-of-order request. + * Try one extra pass over the file to find it. + */ + initialScan = false; + if (fseeko(AH->FH, ctx->dataStart, SEEK_SET) != 0) + fatal("error during file seek: %m"); + _readBlockHeader(AH, &blkType, &id); + } } } else -- 2.26.2
>From 69d7a9bafa342dc558c0d8efcee1af4b9e7e07d5 Mon Sep 17 00:00:00 2001 From: David Gilman <davidgilm...@gmail.com> Date: Sat, 23 May 2020 10:49:33 -0400 Subject: [PATCH 4/4] Add integration test for out-of-order TOC requests in pg_restore Add undocumented --disable-seeking argument to pg_dump to emulate writing to an unseekable output file. --- src/bin/pg_dump/pg_backup.h | 4 +- src/bin/pg_dump/pg_backup_archiver.c | 14 ++-- src/bin/pg_dump/pg_backup_archiver.h | 1 + src/bin/pg_dump/pg_backup_custom.c | 7 +- src/bin/pg_dump/pg_dump.c | 6 +- src/bin/pg_dump/t/002_pg_dump.pl | 97 +++++++++++++++++++++++++++- 6 files changed, 117 insertions(+), 12 deletions(-) diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 8c0cedcd98..160d705eb5 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -158,6 +158,7 @@ typedef struct _dumpOptions int use_setsessauth; int enable_row_security; int load_via_partition_root; + int disable_seeking; /* default, if no "inclusion" switches appear, is to dump everything */ bool include_everything; @@ -270,7 +271,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); /* Create a new archive */ extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker); + SetupWorkerPtrType setupDumpWorker, + bool disableSeeking); /* The --list option */ extern void PrintTOCSummary(Archive *AH); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 8f0b32ca17..0e5d4e0a60 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -69,7 +69,8 @@ typedef struct _parallelReadyList static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr); + SetupWorkerPtrType setupWorkerPtr, + bool disableSeeking); static void _getObjectDescription(PQExpBuffer buf, TocEntry *te, ArchiveHandle *AH); static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, bool isData); @@ -233,11 +234,12 @@ setupRestoreWorker(Archive *AHX) Archive * CreateArchive(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupDumpWorker) + SetupWorkerPtrType setupDumpWorker, + bool disableSeeking) { ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, dosync, - mode, setupDumpWorker); + mode, setupDumpWorker, disableSeeking); return (Archive *) AH; } @@ -247,7 +249,7 @@ CreateArchive(const char *FileSpec, const ArchiveFormat fmt, Archive * OpenArchive(const char *FileSpec, const ArchiveFormat fmt) { - ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker); + ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker, false); return (Archive *) AH; } @@ -2274,7 +2276,8 @@ _discoverArchiveFormat(ArchiveHandle *AH) static ArchiveHandle * _allocAH(const char *FileSpec, const ArchiveFormat fmt, const int compression, bool dosync, ArchiveMode mode, - SetupWorkerPtrType setupWorkerPtr) + SetupWorkerPtrType setupWorkerPtr, + bool disableSeeking) { ArchiveHandle *AH; @@ -2325,6 +2328,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, AH->mode = mode; AH->compression = compression; AH->dosync = dosync; + AH->disableSeeking = disableSeeking; memset(&(AH->sqlparse), 0, sizeof(AH->sqlparse)); diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index f9e6b42752..acf38c20da 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -340,6 +340,7 @@ struct _archiveHandle bool dosync; /* data requested to be synced on sight */ ArchiveMode mode; /* File mode - r or w */ void *formatData; /* Header data specific to file format */ + bool disableSeeking; /* Don't use fseeko() */ /* these vars track state to avoid sending redundant SET commands */ char *currUser; /* current username, or NULL if unknown */ diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index 851add4915..79445ac363 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -145,6 +145,7 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) AH->lo_buf = (void *) pg_malloc(LOBBUFSIZE); ctx->filePos = 0; + ctx->hasSeek = 0; /* * Now open the file @@ -164,7 +165,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) fatal("could not open output file: %m"); } - ctx->hasSeek = checkSeek(AH->FH); + if (!AH->disableSeeking) + ctx->hasSeek = checkSeek(AH->FH); } else { @@ -181,7 +183,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) fatal("could not open input file: %m"); } - ctx->hasSeek = checkSeek(AH->FH); + if (!AH->disableSeeking) + ctx->hasSeek = checkSeek(AH->FH); ReadHead(AH); ReadToc(AH); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index a4e949c636..6ed45d319f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -319,6 +319,7 @@ main(int argc, char **argv) int plainText = 0; ArchiveFormat archiveFormat = archUnknown; ArchiveMode archiveMode; + bool disableSeeking = false; static DumpOptions dopt; @@ -360,6 +361,7 @@ main(int argc, char **argv) {"binary-upgrade", no_argument, &dopt.binary_upgrade, 1}, {"column-inserts", no_argument, &dopt.column_inserts, 1}, {"disable-dollar-quoting", no_argument, &dopt.disable_dollar_quoting, 1}, + {"disable-seeking", no_argument, &dopt.disable_seeking, 1}, {"disable-triggers", no_argument, &dopt.disable_triggers, 1}, {"enable-row-security", no_argument, &dopt.enable_row_security, 1}, {"exclude-table-data", required_argument, NULL, 4}, @@ -673,6 +675,8 @@ main(int argc, char **argv) if (archiveFormat == archNull) plainText = 1; + disableSeeking = (bool) dopt.disable_seeking; + /* Custom and directory formats are compressed by default, others not */ if (compressLevel == -1) { @@ -715,7 +719,7 @@ main(int argc, char **argv) /* Open the output file */ fout = CreateArchive(filename, archiveFormat, compressLevel, dosync, - archiveMode, setupDumpWorker); + archiveMode, setupDumpWorker, disableSeeking); /* Make dump options accessible right away */ SetArchiveOptions(fout, &dopt, NULL); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index e116235769..2423ec5b1d 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -31,9 +31,20 @@ my $tempdir_short = TestLib::tempdir_short; # generate a text file to run through the tests from the # non-text file generated by pg_dump. # -# TODO: Have pg_restore actually restore to an independent -# database and then pg_dump *that* database (or something along -# those lines) to validate that part of the process. +# secondary_dump_cmd is an optional key that enables a +# +# pg_dump (dump_cmd) -> +# pg_restore (restore_cmd, real database) -> +# pg_dump (secondary_dmp_cmd, SQL output (for matching)) +# +# test process instead of the default +# +# pg_dump (dump_cmd) -> +# pg_restore (restore_cmd, SQL output (for matching)) +# +# test process. The secondary_dump database is created for +# you and your restore_cmd and secondary_dmp_cmd must read +# and write from it. my %pgdump_runs = ( binary_upgrade => { @@ -139,6 +150,23 @@ my %pgdump_runs = ( "$tempdir/defaults_custom_format.dump", ], }, + defaults_custom_format_no_seek_parallel_restore => { + test_key => 'defaults', + dump_cmd => [ + 'pg_dump', '-Fc', '-Z6', '--no-sync', '--disable-seeking', + "--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.dump", 'postgres', + ], + restore_cmd => [ + 'pg_restore', '-Fc', '--jobs=2', + '--dbname=secondary_dump', + "$tempdir/defaults_custom_format_no_seek_parallel_restore.dump", + ], + secondary_dump_cmd => [ + 'pg_dump', '-Fp', '--no-sync', + "--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.sql", + 'secondary_dump', + ], + }, # Do not use --no-sync to give test coverage for data sync. defaults_dir_format => { @@ -153,6 +181,24 @@ my %pgdump_runs = ( "$tempdir/defaults_dir_format", ], }, + defaults_dir_format_parallel => { + test_key => 'defaults', + dump_cmd => [ + 'pg_dump', '-Fd', + '--jobs=2', '--no-sync', + "--file=$tempdir/defaults_dir_format_parallel", 'postgres', + ], + restore_cmd => [ + 'pg_restore', '-Fd', + '--jobs=2', '--dbname=secondary_dump', + "$tempdir/defaults_dir_format_parallel", + ], + secondary_dump_cmd => [ + 'pg_dump', '-Fp', '--no-sync', + "--file=$tempdir/defaults_dir_format_parallel.sql", + 'secondary_dump', + ], + }, # Do not use --no-sync to give test coverage for data sync. defaults_parallel => { @@ -3298,6 +3344,27 @@ my %tests = ( %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, unlike => { exclude_dump_test_schema => 1 }, + }, + + # This reliably produces the "possibly due to out-of-order restore request" + # when restoring with -j 2 when this was written but future changes to how + # pg_dump works could accidentally correctly order things as we're not really + # telling pg_dump how to do its on-disk layout. + 'custom dump out-of-order' => { + create_sql => ' + CREATE TABLE ooo_parent0 (id integer primary key); + CREATE TABLE ooo_child0 (parent0 int references ooo_parent0 (id)); + CREATE TABLE ooo_parent1 (id integer primary key); + CREATE TABLE ooo_child1 (parent0 int references ooo_parent1 (id));', + regexp => qr/^ + \QCREATE TABLE public.ooo_child0 (\E\n + \s+\Qparent0 integer\E\n + \Q);\E\n/xm, + like => { + %full_runs, section_pre_data => 1, + defaults_custom_format_no_seek => 1, + defaults_custom_format_no_seek_parallel_restore => 1, + }, }); ######################################### @@ -3350,6 +3417,11 @@ foreach my $run (sort keys %pgdump_runs) $num_tests++; } + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $num_tests++; + } + if ($pgdump_runs{$run}->{test_key}) { $test_key = $pgdump_runs{$run}->{test_key}; @@ -3499,12 +3571,23 @@ foreach my $run (sort keys %pgdump_runs) $node->command_ok(\@{ $pgdump_runs{$run}->{dump_cmd} }, "$run: pg_dump runs"); + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->safe_psql('postgres', 'create database secondary_dump;'); + } + if ($pgdump_runs{$run}->{restore_cmd}) { $node->command_ok(\@{ $pgdump_runs{$run}->{restore_cmd} }, "$run: pg_restore runs"); } + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->command_ok(\@{ $pgdump_runs{$run}->{secondary_dump_cmd} }, + "$run: secondary pg_dump runs"); + } + if ($pgdump_runs{$run}->{test_key}) { $test_key = $pgdump_runs{$run}->{test_key}; @@ -3561,6 +3644,14 @@ foreach my $run (sort keys %pgdump_runs) } } } + + if ($pgdump_runs{$run}->{secondary_dump_cmd}) + { + $node->safe_psql('secondary_dump', + 'alter subscription sub1 set (slot_name = NONE);'); + $node->safe_psql('secondary_dump', 'drop subscription sub1;'); + $node->safe_psql('postgres', 'drop database secondary_dump;'); + } } ######################################### -- 2.26.2