The earlier patches weren't applying because I had "git config
diff.noprefix true" set globally and that was messing up the git
format-patch output.

On Mon, May 25, 2020 at 01:54:29PM -0500, David Gilman wrote:
> And I misunderstood how bad it was. I thought it was reading little
> header structs off the disk but it's actually reading the entire table
> (see _skipData). So you're quadratically rereading entire tables and
> thrashing your cache. Oops.

I changed _skipData to fseeko() instead of fread() when possible to cut
down on this thrashing further.

-- 
David Gilman  :DG<
https://gilslotd.com
>From 5ffc0420bc6040cc26b1a36ac522b696ef428464 Mon Sep 17 00:00:00 2001
From: David Gilman <davidgilm...@gmail.com>
Date: Fri, 22 May 2020 17:27:51 -0400
Subject: [PATCH 1/4] Remove unused seek check in tar dump format

---
 src/bin/pg_dump/pg_backup_tar.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c
index d5bfa55646..bb5d3b1f45 100644
--- a/src/bin/pg_dump/pg_backup_tar.c
+++ b/src/bin/pg_dump/pg_backup_tar.c
@@ -82,7 +82,6 @@ typedef struct
 
 typedef struct
 {
-	int			hasSeek;
 	pgoff_t		filePos;
 	TAR_MEMBER *blobToc;
 	FILE	   *tarFH;
@@ -192,8 +191,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH)
 		 */
 		/* setvbuf(ctx->tarFH, NULL, _IONBF, 0); */
 
-		ctx->hasSeek = checkSeek(ctx->tarFH);
-
 		/*
 		 * We don't support compression because reading the files back is not
 		 * possible since gzdopen uses buffered IO which totally screws file
@@ -226,8 +223,6 @@ InitArchiveFmt_Tar(ArchiveHandle *AH)
 
 		ctx->tarFHpos = 0;
 
-		ctx->hasSeek = checkSeek(ctx->tarFH);
-
 		/*
 		 * Forcibly unmark the header as read since we use the lookahead
 		 * buffer
-- 
2.26.2

>From 8687d3947cab7ffadc0c7bdc61e19c4b8a94da2d Mon Sep 17 00:00:00 2001
From: David Gilman <davidgilm...@gmail.com>
Date: Mon, 25 May 2020 17:34:52 -0400
Subject: [PATCH 2/4] Skip tables in pg_restore by seeking instead of reading

---
 src/bin/pg_dump/pg_backup_custom.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c
index 369dcea429..8c84e3c611 100644
--- a/src/bin/pg_dump/pg_backup_custom.c
+++ b/src/bin/pg_dump/pg_backup_custom.c
@@ -562,19 +562,27 @@ _skipData(ArchiveHandle *AH)
 	blkLen = ReadInt(AH);
 	while (blkLen != 0)
 	{
-		if (blkLen > buflen)
+		if (ctx->hasSeek)
 		{
-			if (buf)
-				free(buf);
-			buf = (char *) pg_malloc(blkLen);
-			buflen = blkLen;
+			if (fseeko(AH->FH, blkLen, SEEK_CUR) != 0)
+				fatal("error during file seek: %m");
 		}
-		if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen)
+		else
 		{
-			if (feof(AH->FH))
-				fatal("could not read from input file: end of file");
-			else
-				fatal("could not read from input file: %m");
+			if (blkLen > buflen)
+			{
+				if (buf)
+					free(buf);
+				buf = (char *) pg_malloc(blkLen);
+				buflen = blkLen;
+			}
+			if ((cnt = fread(buf, 1, blkLen, AH->FH)) != blkLen)
+			{
+				if (feof(AH->FH))
+					fatal("could not read from input file: end of file");
+				else
+					fatal("could not read from input file: %m");
+			}
 		}
 
 		ctx->filePos += blkLen;
-- 
2.26.2

>From ed106edbfd2eeeadec74289afb8d8bfe8f37fff1 Mon Sep 17 00:00:00 2001
From: David Gilman <davidgilm...@gmail.com>
Date: Wed, 20 May 2020 22:49:28 -0400
Subject: [PATCH 3/4] Scan all TOCs when restoring a custom dump file without
 offsets

TOC requests are not guaranteed to come in disk order. If the custom
dump file was written with data offsets, pg_restore can seek directly to
the data, making request order irrelevant. If there are no data offsets
pg_restore would never attempt to seek backwards, even when it was
possible, and would not find TOCs before the current read position in
the file. 548e50976 changed how pg_restore's parallel algorithm worked
at the cost of greatly increasing out-of-order TOC requests.

This patch changes pg_restore to scan through all TOCs to service a TOC
read request when restoring a custom dump file without data offsets.
The odds of getting a successful parallel restore go way up at the cost
of a bunch of extra tiny reads when pg_restore starts up.

The pg_restore manpage now warns against running pg_dump with an
unseekable output file and suggests that if you plan on doing a parallel
restore of a custom dump you should run pg_dump with --file.
---
 doc/src/sgml/ref/pg_restore.sgml   |  8 ++++++++
 src/bin/pg_dump/pg_backup_custom.c | 24 ++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml
index 232f88024f..a0bf64767b 100644
--- a/doc/src/sgml/ref/pg_restore.sgml
+++ b/doc/src/sgml/ref/pg_restore.sgml
@@ -279,6 +279,14 @@ PostgreSQL documentation
         jobs cannot be used together with the
         option <option>--single-transaction</option>.
        </para>
+
+       <para>
+        <application>pg_restore</application> with concurrent jobs may fail
+        when restoring a custom archive format dump written to an unseekable
+        output stream, like stdout. For the best performance when restoring
+        a custom archive format dump use <application>pg_dump</application>'s
+        <option>--file</option> option to specify an output file.
+       </para>
       </listitem>
      </varlistentry>
 
diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c
index 8c84e3c611..851add4915 100644
--- a/src/bin/pg_dump/pg_backup_custom.c
+++ b/src/bin/pg_dump/pg_backup_custom.c
@@ -415,6 +415,7 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te)
 	lclTocEntry *tctx = (lclTocEntry *) te->formatData;
 	int			blkType;
 	int			id;
+	bool		initialScan = true;
 
 	if (tctx->dataState == K_OFFSET_NO_DATA)
 		return;
@@ -423,11 +424,19 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te)
 	{
 		/*
 		 * We cannot seek directly to the desired block.  Instead, skip over
-		 * block headers until we find the one we want.  This could fail if we
-		 * are asked to restore items out-of-order.
+		 * block headers until we find the one we want.
 		 */
+
 		_readBlockHeader(AH, &blkType, &id);
 
+		if (blkType == EOF && ctx->hasSeek) {
+			/* Started at the end of the file */
+			initialScan = false;
+			if (fseeko(AH->FH, ctx->dataStart, SEEK_SET) != 0)
+				fatal("error during file seek: %m");
+			_readBlockHeader(AH, &blkType, &id);
+		}
+
 		while (blkType != EOF && id != te->dumpId)
 		{
 			switch (blkType)
@@ -446,6 +455,17 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te)
 					break;
 			}
 			_readBlockHeader(AH, &blkType, &id);
+
+			if (blkType == EOF && ctx->hasSeek && initialScan) {
+				/*
+				 * This was possibly an out-of-order request.
+				 * Try one extra pass over the file to find it.
+				 */
+				initialScan = false;
+				if (fseeko(AH->FH, ctx->dataStart, SEEK_SET) != 0)
+					fatal("error during file seek: %m");
+				_readBlockHeader(AH, &blkType, &id);
+			}
 		}
 	}
 	else
-- 
2.26.2

>From 69d7a9bafa342dc558c0d8efcee1af4b9e7e07d5 Mon Sep 17 00:00:00 2001
From: David Gilman <davidgilm...@gmail.com>
Date: Sat, 23 May 2020 10:49:33 -0400
Subject: [PATCH 4/4] Add integration test for out-of-order TOC requests in
 pg_restore

Add undocumented --disable-seeking argument to pg_dump to emulate
writing to an unseekable output file.
---
 src/bin/pg_dump/pg_backup.h          |  4 +-
 src/bin/pg_dump/pg_backup_archiver.c | 14 ++--
 src/bin/pg_dump/pg_backup_archiver.h |  1 +
 src/bin/pg_dump/pg_backup_custom.c   |  7 +-
 src/bin/pg_dump/pg_dump.c            |  6 +-
 src/bin/pg_dump/t/002_pg_dump.pl     | 97 +++++++++++++++++++++++++++-
 6 files changed, 117 insertions(+), 12 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index 8c0cedcd98..160d705eb5 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -158,6 +158,7 @@ typedef struct _dumpOptions
 	int			use_setsessauth;
 	int			enable_row_security;
 	int			load_via_partition_root;
+	int			disable_seeking;
 
 	/* default, if no "inclusion" switches appear, is to dump everything */
 	bool		include_everything;
@@ -270,7 +271,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt);
 /* Create a new archive */
 extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
 							  const int compression, bool dosync, ArchiveMode mode,
-							  SetupWorkerPtrType setupDumpWorker);
+							  SetupWorkerPtrType setupDumpWorker,
+							  bool disableSeeking);
 
 /* The --list option */
 extern void PrintTOCSummary(Archive *AH);
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 8f0b32ca17..0e5d4e0a60 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -69,7 +69,8 @@ typedef struct _parallelReadyList
 
 static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt,
 							   const int compression, bool dosync, ArchiveMode mode,
-							   SetupWorkerPtrType setupWorkerPtr);
+							   SetupWorkerPtrType setupWorkerPtr,
+							   bool disableSeeking);
 static void _getObjectDescription(PQExpBuffer buf, TocEntry *te,
 								  ArchiveHandle *AH);
 static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, bool isData);
@@ -233,11 +234,12 @@ setupRestoreWorker(Archive *AHX)
 Archive *
 CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
 			  const int compression, bool dosync, ArchiveMode mode,
-			  SetupWorkerPtrType setupDumpWorker)
+			  SetupWorkerPtrType setupDumpWorker,
+			  bool disableSeeking)
 
 {
 	ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression, dosync,
-								 mode, setupDumpWorker);
+								 mode, setupDumpWorker, disableSeeking);
 
 	return (Archive *) AH;
 }
@@ -247,7 +249,7 @@ CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
 Archive *
 OpenArchive(const char *FileSpec, const ArchiveFormat fmt)
 {
-	ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker);
+	ArchiveHandle *AH = _allocAH(FileSpec, fmt, 0, true, archModeRead, setupRestoreWorker, false);
 
 	return (Archive *) AH;
 }
@@ -2274,7 +2276,8 @@ _discoverArchiveFormat(ArchiveHandle *AH)
 static ArchiveHandle *
 _allocAH(const char *FileSpec, const ArchiveFormat fmt,
 		 const int compression, bool dosync, ArchiveMode mode,
-		 SetupWorkerPtrType setupWorkerPtr)
+		 SetupWorkerPtrType setupWorkerPtr,
+		 bool disableSeeking)
 {
 	ArchiveHandle *AH;
 
@@ -2325,6 +2328,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt,
 	AH->mode = mode;
 	AH->compression = compression;
 	AH->dosync = dosync;
+	AH->disableSeeking = disableSeeking;
 
 	memset(&(AH->sqlparse), 0, sizeof(AH->sqlparse));
 
diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h
index f9e6b42752..acf38c20da 100644
--- a/src/bin/pg_dump/pg_backup_archiver.h
+++ b/src/bin/pg_dump/pg_backup_archiver.h
@@ -340,6 +340,7 @@ struct _archiveHandle
 	bool		dosync;			/* data requested to be synced on sight */
 	ArchiveMode mode;			/* File mode - r or w */
 	void	   *formatData;		/* Header data specific to file format */
+	bool		disableSeeking;	/* Don't use fseeko() */
 
 	/* these vars track state to avoid sending redundant SET commands */
 	char	   *currUser;		/* current username, or NULL if unknown */
diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c
index 851add4915..79445ac363 100644
--- a/src/bin/pg_dump/pg_backup_custom.c
+++ b/src/bin/pg_dump/pg_backup_custom.c
@@ -145,6 +145,7 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
 	AH->lo_buf = (void *) pg_malloc(LOBBUFSIZE);
 
 	ctx->filePos = 0;
+	ctx->hasSeek = 0;
 
 	/*
 	 * Now open the file
@@ -164,7 +165,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
 				fatal("could not open output file: %m");
 		}
 
-		ctx->hasSeek = checkSeek(AH->FH);
+		if (!AH->disableSeeking)
+			ctx->hasSeek = checkSeek(AH->FH);
 	}
 	else
 	{
@@ -181,7 +183,8 @@ InitArchiveFmt_Custom(ArchiveHandle *AH)
 				fatal("could not open input file: %m");
 		}
 
-		ctx->hasSeek = checkSeek(AH->FH);
+		if (!AH->disableSeeking)
+			ctx->hasSeek = checkSeek(AH->FH);
 
 		ReadHead(AH);
 		ReadToc(AH);
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a4e949c636..6ed45d319f 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -319,6 +319,7 @@ main(int argc, char **argv)
 	int			plainText = 0;
 	ArchiveFormat archiveFormat = archUnknown;
 	ArchiveMode archiveMode;
+	bool		disableSeeking = false;
 
 	static DumpOptions dopt;
 
@@ -360,6 +361,7 @@ main(int argc, char **argv)
 		{"binary-upgrade", no_argument, &dopt.binary_upgrade, 1},
 		{"column-inserts", no_argument, &dopt.column_inserts, 1},
 		{"disable-dollar-quoting", no_argument, &dopt.disable_dollar_quoting, 1},
+		{"disable-seeking", no_argument, &dopt.disable_seeking, 1},
 		{"disable-triggers", no_argument, &dopt.disable_triggers, 1},
 		{"enable-row-security", no_argument, &dopt.enable_row_security, 1},
 		{"exclude-table-data", required_argument, NULL, 4},
@@ -673,6 +675,8 @@ main(int argc, char **argv)
 	if (archiveFormat == archNull)
 		plainText = 1;
 
+	disableSeeking = (bool) dopt.disable_seeking;
+
 	/* Custom and directory formats are compressed by default, others not */
 	if (compressLevel == -1)
 	{
@@ -715,7 +719,7 @@ main(int argc, char **argv)
 
 	/* Open the output file */
 	fout = CreateArchive(filename, archiveFormat, compressLevel, dosync,
-						 archiveMode, setupDumpWorker);
+						 archiveMode, setupDumpWorker, disableSeeking);
 
 	/* Make dump options accessible right away */
 	SetArchiveOptions(fout, &dopt, NULL);
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index e116235769..2423ec5b1d 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -31,9 +31,20 @@ my $tempdir_short = TestLib::tempdir_short;
 # generate a text file to run through the tests from the
 # non-text file generated by pg_dump.
 #
-# TODO: Have pg_restore actually restore to an independent
-# database and then pg_dump *that* database (or something along
-# those lines) to validate that part of the process.
+# secondary_dump_cmd is an optional key that enables a
+#
+# 	pg_dump (dump_cmd) ->
+# 	pg_restore (restore_cmd, real database) ->
+#	pg_dump (secondary_dmp_cmd, SQL output (for matching))
+#
+# test process instead of the default
+#
+# 	pg_dump (dump_cmd) ->
+# 	pg_restore (restore_cmd, SQL output (for matching))
+#
+# test process. The secondary_dump database is created for
+# you and your restore_cmd and secondary_dmp_cmd must read
+# and write from it.
 
 my %pgdump_runs = (
 	binary_upgrade => {
@@ -139,6 +150,23 @@ my %pgdump_runs = (
 			"$tempdir/defaults_custom_format.dump",
 		],
 	},
+	defaults_custom_format_no_seek_parallel_restore => {
+		test_key => 'defaults',
+		dump_cmd => [
+			'pg_dump', '-Fc', '-Z6', '--no-sync', '--disable-seeking',
+			"--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.dump", 'postgres',
+		],
+		restore_cmd => [
+			'pg_restore', '-Fc', '--jobs=2',
+			'--dbname=secondary_dump',
+			"$tempdir/defaults_custom_format_no_seek_parallel_restore.dump",
+		],
+		secondary_dump_cmd => [
+			'pg_dump', '-Fp', '--no-sync',
+			"--file=$tempdir/defaults_custom_format_no_seek_parallel_restore.sql",
+			'secondary_dump',
+		],
+	},
 
 	# Do not use --no-sync to give test coverage for data sync.
 	defaults_dir_format => {
@@ -153,6 +181,24 @@ my %pgdump_runs = (
 			"$tempdir/defaults_dir_format",
 		],
 	},
+	defaults_dir_format_parallel => {
+		test_key => 'defaults',
+		dump_cmd => [
+			'pg_dump',                                      '-Fd',
+			'--jobs=2',                                     '--no-sync',
+			"--file=$tempdir/defaults_dir_format_parallel", 'postgres',
+		],
+		restore_cmd => [
+			'pg_restore', '-Fd',
+			'--jobs=2', '--dbname=secondary_dump',
+			"$tempdir/defaults_dir_format_parallel",
+		],
+		secondary_dump_cmd => [
+			'pg_dump', '-Fp', '--no-sync',
+			"--file=$tempdir/defaults_dir_format_parallel.sql",
+			'secondary_dump',
+		],
+	},
 
 	# Do not use --no-sync to give test coverage for data sync.
 	defaults_parallel => {
@@ -3298,6 +3344,27 @@ my %tests = (
 			%full_runs, %dump_test_schema_runs, section_pre_data => 1,
 		},
 		unlike => { exclude_dump_test_schema => 1 },
+	},
+
+	# This reliably produces the "possibly due to out-of-order restore request"
+	# when restoring with -j 2 when this was written but future changes to how
+	# pg_dump works could accidentally correctly order things as we're not really
+	# telling pg_dump how to do its on-disk layout.
+	'custom dump out-of-order' => {
+		create_sql   => '
+			CREATE TABLE ooo_parent0 (id integer primary key);
+			CREATE TABLE ooo_child0 (parent0 int references ooo_parent0 (id));
+			CREATE TABLE ooo_parent1 (id integer primary key);
+			CREATE TABLE ooo_child1 (parent0 int references ooo_parent1 (id));',
+		regexp => qr/^
+			 \QCREATE TABLE public.ooo_child0 (\E\n
+			 \s+\Qparent0 integer\E\n
+			 \Q);\E\n/xm,
+		like => {
+			%full_runs, section_pre_data => 1,
+			defaults_custom_format_no_seek => 1,
+			defaults_custom_format_no_seek_parallel_restore => 1,
+		},
 	});
 
 #########################################
@@ -3350,6 +3417,11 @@ foreach my $run (sort keys %pgdump_runs)
 		$num_tests++;
 	}
 
+	if ($pgdump_runs{$run}->{secondary_dump_cmd})
+	{
+		$num_tests++;
+	}
+
 	if ($pgdump_runs{$run}->{test_key})
 	{
 		$test_key = $pgdump_runs{$run}->{test_key};
@@ -3499,12 +3571,23 @@ foreach my $run (sort keys %pgdump_runs)
 	$node->command_ok(\@{ $pgdump_runs{$run}->{dump_cmd} },
 		"$run: pg_dump runs");
 
+	if ($pgdump_runs{$run}->{secondary_dump_cmd})
+	{
+		$node->safe_psql('postgres', 'create database secondary_dump;');
+	}
+
 	if ($pgdump_runs{$run}->{restore_cmd})
 	{
 		$node->command_ok(\@{ $pgdump_runs{$run}->{restore_cmd} },
 			"$run: pg_restore runs");
 	}
 
+	if ($pgdump_runs{$run}->{secondary_dump_cmd})
+	{
+		$node->command_ok(\@{ $pgdump_runs{$run}->{secondary_dump_cmd} },
+			"$run: secondary pg_dump runs");
+	}
+
 	if ($pgdump_runs{$run}->{test_key})
 	{
 		$test_key = $pgdump_runs{$run}->{test_key};
@@ -3561,6 +3644,14 @@ foreach my $run (sort keys %pgdump_runs)
 			}
 		}
 	}
+
+	if ($pgdump_runs{$run}->{secondary_dump_cmd})
+	{
+		$node->safe_psql('secondary_dump',
+			'alter subscription sub1 set (slot_name = NONE);');
+		$node->safe_psql('secondary_dump', 'drop subscription sub1;');
+		$node->safe_psql('postgres',       'drop database secondary_dump;');
+	}
 }
 
 #########################################
-- 
2.26.2

Reply via email to