On Wed Jul 19, 2023 at 10:07 PM CDT, Michael Paquier wrote:
> So this patch causes pgbench to not stick with its historical
> behavior, and the change is incompatible with the comments because the
> tellers and branches tables don't use NULL for their filler attribute
> anymore.

Great find. This was a case of me not properly understanding the COPY command. Here is the relevant documentation snippet:

Specifies the string that represents a null value. The default is \N (backslash-N) in text format, and an unquoted empty string in CSV format. You might prefer an empty string even in text format for cases where you don't want to distinguish nulls from empty strings. This option is not allowed when using binary format.

This new revision populates the filler columns of pgbench_branches and pgbench_tellers with NULL.
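
To illustrate (a hand-typed psql session, so treat it as a sketch; the throwaway temp table is mine, with its filler type copied from pgbench_branches), COPY in text format interprets \N as NULL by default:

tristan957=# create temp table t (filler char(88));
CREATE TABLE
tristan957=# copy t from stdin;
Enter data to be copied followed by a newline.
End with a backslash and a period on a line by itself, or an EOF signal.
>> \N
>> \.
COPY 1
tristan957=# select count(1) from t where filler is null;
 count
-------
     1
(1 row)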

psql (17devel)
Type "help" for help.

tristan957=# select count(1) from pgbench_branches;
 count
-------
     1
(1 row)

tristan957=# select count(1) from pgbench_branches where filler is null;
 count
-------
     1
(1 row)

Thanks for your testing, Michael. I went ahead and added a test to make sure that this behavior doesn't regress accidentally, but I am struggling to get the test to fail using the previous version of this patch. Do you have any advice? This is my first time writing a test for Postgres. I can recreate the issue outside of the test script, but not within it, for whatever reason.
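
For reference, the by-hand check looks roughly like this (a sketch; the database name is arbitrary). With the previous revision the second command printed 0, since the fillers were empty strings rather than NULL; with this revision it prints 10, one row per teller at scale 1:

$ pgbench --initialize --scale=1 postgres
$ psql --no-align --tuples-only -c 'select count(1) from pgbench_tellers where filler is null' postgres
10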

--
Tristan Partin
Neon (https://neon.tech)
From cf84b3ea2d7b583aaee3f80807b26ef4521db0f6 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tris...@neon.tech>
Date: Tue, 23 May 2023 11:48:16 -0500
Subject: [PATCH v6] Use COPY instead of INSERT for populating all tables

COPY is a better interface for bulk loading and for high-latency
connections. Previously, COPY was only used for the pgbench_accounts
table because its data volume was so much larger than that of the other
tables. However, there are also gains to be had in the high-latency
case, such as loading data across continents.
---
 doc/src/sgml/ref/pgbench.sgml                |   8 +-
 src/bin/pgbench/pgbench.c                    | 151 +++++++++++--------
 src/bin/pgbench/t/001_pgbench_with_server.pl |  18 +++
 3 files changed, 110 insertions(+), 67 deletions(-)

diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 850028557d..4424595cc6 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -231,10 +231,10 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
             extensively through a <command>COPY</command>.
             <command>pgbench</command> uses the FREEZE option with version 14 or later
             of <productname>PostgreSQL</productname> to speed up
-            subsequent <command>VACUUM</command>, unless partitions are enabled.
-            Using <literal>g</literal> causes logging to print one message
-            every 100,000 rows while generating data for the
-            <structname>pgbench_accounts</structname> table.
+            subsequent <command>VACUUM</command>, unless partitions are
+            enabled for the <structname>pgbench_accounts</structname> table.
+            Using <literal>g</literal> causes logging to print one message
+            every 100,000 rows while generating data for all tables.
            </para>
            <para>
             With <literal>G</literal> (server-side data generation),
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 087baa7d57..ef01d049a1 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -4859,17 +4859,44 @@ initTruncateTables(PGconn *con)
 					 "pgbench_tellers");
 }
 
-/*
- * Fill the standard tables with some data generated and sent from the client
- */
+typedef void initRow(PQExpBufferData *sql, int64 curr);
+
 static void
-initGenerateDataClientSide(PGconn *con)
+initBranch(PQExpBufferData *sql, int64 curr)
+{
+	printfPQExpBuffer(sql,
+					  INT64_FORMAT "\t0\t\\N\n",
+					  curr + 1);
+}
+
+static void
+initTeller(PQExpBufferData *sql, int64 curr)
+{
+	printfPQExpBuffer(sql,
+					  INT64_FORMAT "\t" INT64_FORMAT "\t0\t\\N\n",
+					  curr + 1, curr / ntellers + 1);
+}
+
+static void
+initAccount(PQExpBufferData *sql, int64 curr)
 {
+	/* "filler" column defaults to blank padded empty string */
+	printfPQExpBuffer(sql,
+					  INT64_FORMAT "\t" INT64_FORMAT "\t0\t\n",
+					  curr + 1, curr / naccounts + 1);
+}
+
+static void
+initPopulateTable(PGconn *con, const char *table, int64 base, initRow *init_row)
+{
+	int			n;
+	int			k;
+	int			chars = 0;
+	PGresult	*res;
 	PQExpBufferData sql;
-	PGresult   *res;
-	int			i;
-	int64		k;
-	char	   *copy_statement;
+	char 		copy_statement[256];
+	const char *copy_statement_fmt = "copy %s from stdin";
+	const int64 total = base * scale;
 
 	/* used to track elapsed time and estimate of the remaining time */
 	pg_time_usec_t start;
@@ -4878,50 +4905,24 @@ initGenerateDataClientSide(PGconn *con)
 	/* Stay on the same line if reporting to a terminal */
 	char		eol = isatty(fileno(stderr)) ? '\r' : '\n';
 
-	fprintf(stderr, "generating data (client-side)...\n");
-
-	/*
-	 * we do all of this in one transaction to enable the backend's
-	 * data-loading optimizations
-	 */
-	executeStatement(con, "begin");
-
-	/* truncate away any old data */
-	initTruncateTables(con);
-
 	initPQExpBuffer(&sql);
 
 	/*
-	 * fill branches, tellers, accounts in that order in case foreign keys
-	 * already exist
+	 * Use COPY with FREEZE on v14 and later without partitioning.  Remember
+	 * that partitioning applies only to pgbench_accounts.
 	 */
-	for (i = 0; i < nbranches * scale; i++)
-	{
-		/* "filler" column defaults to NULL */
-		printfPQExpBuffer(&sql,
-						  "insert into pgbench_branches(bid,bbalance) values(%d,0)",
-						  i + 1);
-		executeStatement(con, sql.data);
-	}
+	if (PQserverVersion(con) >= 140000)
+	{
+		const bool	is_pgbench_accounts = strcmp(table, "pgbench_accounts") == 0;
 
-	for (i = 0; i < ntellers * scale; i++)
-	{
-		/* "filler" column defaults to NULL */
-		printfPQExpBuffer(&sql,
-						  "insert into pgbench_tellers(tid,bid,tbalance) values (%d,%d,0)",
-						  i + 1, i / ntellers + 1);
-		executeStatement(con, sql.data);
+		if (!is_pgbench_accounts || partitions == 0)
+			copy_statement_fmt = "copy %s from stdin with (freeze on)";
 	}
 
-	/*
-	 * accounts is big enough to be worth using COPY and tracking runtime
-	 */
-
-	/* use COPY with FREEZE on v14 and later without partitioning */
-	if (partitions == 0 && PQserverVersion(con) >= 140000)
-		copy_statement = "copy pgbench_accounts from stdin with (freeze on)";
-	else
-		copy_statement = "copy pgbench_accounts from stdin";
+	n = pg_snprintf(copy_statement, sizeof(copy_statement), copy_statement_fmt, table);
+	if (n == -1)
+		pg_fatal("invalid format string");
+	else if ((size_t) n >= sizeof(copy_statement))
+		pg_fatal("invalid buffer size: must be at least %d characters long", n + 1);
 
 	res = PQexec(con, copy_statement);
 
@@ -4931,14 +4932,11 @@ initGenerateDataClientSide(PGconn *con)
 
 	start = pg_time_now();
 
-	for (k = 0; k < (int64) naccounts * scale; k++)
+	for (k = 0; k < total; k++)
 	{
-		int64		j = k + 1;
+		const int64		j = k + 1;
 
-		/* "filler" column defaults to blank padded empty string */
-		printfPQExpBuffer(&sql,
-						  INT64_FORMAT "\t" INT64_FORMAT "\t0\t\n",
-						  j, k / naccounts + 1);
+		init_row(&sql, k);
 		if (PQputline(con, sql.data))
 			pg_fatal("PQputline failed");
 
@@ -4952,25 +4950,26 @@ initGenerateDataClientSide(PGconn *con)
 		if ((!use_quiet) && (j % 100000 == 0))
 		{
 			double		elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start);
-			double		remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j;
+			double		remaining_sec = ((double) total - j) * elapsed_sec / j;
 
-			fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c",
-					j, (int64) naccounts * scale,
-					(int) (((int64) j * 100) / (naccounts * (int64) scale)),
-					elapsed_sec, remaining_sec, eol);
+			chars = fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) of %s done (elapsed %.2f s, remaining %.2f s)%c",
+					j, total,
+					(int) ((j * 100) / total),
+					table, elapsed_sec, remaining_sec, eol);
 		}
 		/* let's not call the timing for each row, but only each 100 rows */
 		else if (use_quiet && (j % 100 == 0))
 		{
 			double		elapsed_sec = PG_TIME_GET_DOUBLE(pg_time_now() - start);
-			double		remaining_sec = ((double) scale * naccounts - j) * elapsed_sec / j;
+			double		remaining_sec = ((double) total - j) * elapsed_sec / j;
 
 			/* have we reached the next interval (or end)? */
-			if ((j == scale * naccounts) || (elapsed_sec >= log_interval * LOG_STEP_SECONDS))
+			if ((j == total) || (elapsed_sec >= log_interval * LOG_STEP_SECONDS))
 			{
-				fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) done (elapsed %.2f s, remaining %.2f s)%c",
-						j, (int64) naccounts * scale,
-						(int) (((int64) j * 100) / (naccounts * (int64) scale)), elapsed_sec, remaining_sec, eol);
+				chars = fprintf(stderr, INT64_FORMAT " of " INT64_FORMAT " tuples (%d%%) of %s done (elapsed %.2f s, remaining %.2f s)%c",
+						j, total,
+						(int) ((j * 100) / total),
+						table, elapsed_sec, remaining_sec, eol);
 
 				/* skip to the next interval */
 				log_interval = (int) ceil(elapsed_sec / LOG_STEP_SECONDS);
@@ -4978,8 +4977,8 @@ initGenerateDataClientSide(PGconn *con)
 		}
 	}
 
-	if (eol != '\n')
-		fputc('\n', stderr);	/* Need to move to next line */
+	if (chars != 0 && eol != '\n')
+		fprintf(stderr, "%*c\r", chars - 1, ' '); /* Clear the current line */
 
 	if (PQputline(con, "\\.\n"))
 		pg_fatal("very last PQputline failed");
@@ -4987,6 +4986,32 @@ initGenerateDataClientSide(PGconn *con)
 		pg_fatal("PQendcopy failed");
 
 	termPQExpBuffer(&sql);
+}
+
+/*
+ * Fill the standard tables with some data generated and sent from the client
+ */
+static void
+initGenerateDataClientSide(PGconn *con)
+{
+	fprintf(stderr, "generating data (client-side)...\n");
+
+	/*
+	 * we do all of this in one transaction to enable the backend's
+	 * data-loading optimizations
+	 */
+	executeStatement(con, "begin");
+
+	/* truncate away any old data */
+	initTruncateTables(con);
+
+	/*
+	 * fill branches, tellers, accounts in that order in case foreign keys
+	 * already exist
+	 */
+	initPopulateTable(con, "pgbench_branches", nbranches, initBranch);
+	initPopulateTable(con, "pgbench_tellers", ntellers, initTeller);
+	initPopulateTable(con, "pgbench_accounts", naccounts, initAccount);
 
 	executeStatement(con, "commit");
 }
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index f8ca8a922d..d62cf87703 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -67,6 +67,24 @@ $node->pgbench(
 	],
 	'pgbench scale 1 initialization',);
 
+# Check that the pgbench_branches and pgbench_tellers filler columns are filled
+# with NULLs
+foreach my $table ('pgbench_branches', 'pgbench_tellers') {
+	my ($ret, $out, $err) = $node->psql(
+		'postgres',
+		"SELECT COUNT(1) FROM $table;
+		 SELECT COUNT(1) FROM $table WHERE filler is NULL;",
+		extra_params => ['--no-align', '--tuples-only']);
+
+	is($ret, 0, "psql $table counts status is 0");
+	is($err, '', "psql $table counts stderr is empty");
+	if ($out =~ m/^(\d+)\n(\d+)$/) {
+		is($1, $2, "psql $table filler column filled with NULLs");
+	} else {
+		fail("psql $table stdout did not match m/^(\\d+)\\n(\\d+)\$/");
+	}
+}
+
 # Again, with all possible options
 $node->pgbench(
 	'--initialize --init-steps=dtpvg --scale=1 --unlogged-tables --fillfactor=98 --foreign-keys --quiet --tablespace=regress_pgbench_tap_1_ts --index-tablespace=regress_pgbench_tap_1_ts --partitions=2 --partition-method=hash',