Now the overhead is really 60-65%. Although the specification is unambiguous, we still need some maths to know whether it fits in buffers or memory... The point of Karel's regression is to take this into account.

Also, whether this option would be more acceptable to Tom is still an open question. Tom?

Here is a version with this approach: the documentation talks about "actual data size, without overheads", and points out that storage overheads are typically an additional 65%.

--
Fabien.
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 3dd492c..ba6174f 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -49,7 +49,7 @@
 
 <screen>
 transaction type: &lt;builtin: TPC-B (sort of)&gt;
-scaling factor: 10
+scaling factor: 10 (95.4 MiB of actual data)
 query mode: simple
 number of clients: 10
 number of threads: 1
@@ -282,6 +282,20 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
         in order to be big enough to hold the range of account
         identifiers.
        </para>
+
+       <para>
+        The scale can also be specified as an expected actual data size in
+        the accounts, tellers and branches tables, without overheads such
+        as headers and indexes, by specifying a unit.
+        For instance, <literal>-s 5G</literal> will approximate the scale
+        required for 5 GiB of data.
+        Note that storage overheads typically represent about 65% of the actual data,
+        and may vary depending on options.
+        Allowed units are IEC 1024 powers (<literal>KiB MiB GiB TiB PiB</literal>),
+        SI 1000 powers (<literal>kB MB GB TB PB</literal>), plain bytes
+        (<literal>B</literal>) and, for convenience, <literal>KB K M G T P</literal>
+        as aliases for the IEC binary sizes.
+       </para>
       </listitem>
      </varlistentry>
 
@@ -1600,7 +1614,7 @@ END;
 <screen>
 starting vacuum...end.
 transaction type: &lt;builtin: TPC-B (sort of)&gt;
-scaling factor: 1
+scaling factor: 1 (9.5 MiB of actual data)
 query mode: simple
 number of clients: 10
 number of threads: 1
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index d420942..a8745d3 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -524,7 +524,7 @@ usage(void)
 		   "  -F, --fillfactor=NUM     set fill factor\n"
 		   "  -n, --no-vacuum          do not run VACUUM during initialization\n"
 		   "  -q, --quiet              quiet logging (one message each 5 seconds)\n"
-		   "  -s, --scale=NUM          scaling factor\n"
+		   "  -s, --scale=NUM|SIZE     scaling factor or expected actual data (without overheads) size\n"
 		   "  --foreign-keys           create foreign key constraints between tables\n"
 		   "  --index-tablespace=TABLESPACE\n"
 		   "                           create indexes in the specified tablespace\n"
@@ -552,7 +552,7 @@ usage(void)
 		   "  -P, --progress=NUM       show thread progress report every NUM seconds\n"
 		   "  -r, --report-latencies   report average latency per command\n"
 		   "  -R, --rate=NUM           target rate in transactions per second\n"
-		   "  -s, --scale=NUM          report this scale factor in output\n"
+		   "  -s, --scale=NUM|SIZE     report this scale factor in output\n"
 		   "  -t, --transactions=NUM   number of transactions each client runs (default: 10)\n"
 		   "  -T, --time=NUM           duration of benchmark test in seconds\n"
 		   "  -v, --vacuum-all         vacuum all four standard tables before tests\n"
@@ -668,6 +668,97 @@ gotdigits:
 	return ((sign < 0) ? -result : result);
 }
 
+/* Parse "<integer>[unit]" into a byte count (no unit means bytes), or exit
+ * with an "invalid <error_message>" report when malformed or out of range.
+ */
+static int64
+parse_size(char * s, const char * error_message)
+{
+	static const struct { const char *name; int64 multiplier; }
+		UNITS[] = {
+			/* IEC units */
+			{ "KiB", 1024 },
+			{ "MiB", 1024 * 1024 },
+			{ "GiB", 1024 * 1024 * 1024 },
+			{ "TiB", (int64) 1024 * 1024 * 1024 * 1024 },
+			{ "PiB", (int64) 1024 * 1024 * 1024 * 1024 * 1024 },
+			/* SI units */
+			{ "kB", 1000 },
+			{ "MB", 1000 * 1000 },
+			{ "GB", 1000 * 1000 * 1000 },
+			{ "TB", (int64) 1000 * 1000 * 1000 * 1000 },
+			{ "PB", (int64) 1000 * 1000 * 1000 * 1000 * 1000 },
+			/* common/convenient JEDEC usage */
+			{ "KB", 1024 },
+			{ "K", 1024 },
+			{ "M", 1024 * 1024 },
+			{ "G", 1024 * 1024 * 1024 },
+			{ "T", (int64) 1024 * 1024 * 1024 * 1024 },
+			{ "P", (int64) 1024 * 1024 * 1024 * 1024 * 1024 },
+			/* unit, then no unit at all: the empty suffix matches any input */
+			{ "B", 1 },
+			{ "", 1 },
+	};
+
+	int		len = strlen(s), ulen = 0, i;
+	int64	size;
+	char	clast;
+	bool	valid;
+
+	/* look for the unit, never comparing before the start of the input */
+	for (i = 0; i < lengthof(UNITS); i++)
+	{
+		ulen = strlen(UNITS[i].name);
+		if (ulen <= len && strcmp(s + len - ulen, UNITS[i].name) == 0)
+			break;
+	}
+
+	/* cut the unit off to check the number, then restore the caller's string */
+	clast = s[len - ulen];
+	s[len - ulen] = '\0';
+	valid = is_an_int(s);
+	size = valid ? strtoint64(s) : 0;
+	s[len - ulen] = clast;
+
+	/* reject non-integers and values whose byte count would overflow int64 */
+	if (!valid || size > PG_INT64_MAX / UNITS[i].multiplier ||
+		size < -(PG_INT64_MAX / UNITS[i].multiplier))
+	{
+		fprintf(stderr, "invalid %s: \"%s\"\n", error_message, s);
+		exit(1);
+	}
+
+	return size * UNITS[i].multiplier;
+}
+
+/* Parse a scale given either as a plain factor or as an actual data size.
+ * Returns at least 1; a scale which does not fit an int is a fatal error.
+ */
+static int
+parse_scale(char * s)
+{
+	double		factor;
+
+	if (is_an_int(s))
+		factor = (double) strtoint64(s);
+	else	/* size refers to actual data, without overheads */
+		factor = ceil(parse_size(s, "scaling factor") /
+					  ((naccounts + ntellers + nbranches) * 100.0));
+
+	if (factor < 1.0)
+	{
+		fprintf(stderr, "scale %s too small, rounded to 1\n", s);
+		factor = 1.0;
+	}
+	else if (factor > PG_INT32_MAX)
+	{
+		fprintf(stderr, "invalid scaling factor: \"%s\"\n", s);
+		exit(1);
+	}
+
+	return (int) factor;
+}
+
 /* random number generator: uniform distribution from min to max inclusive */
 static int64
 getrand(TState *thread, int64 min, int64 max)
@@ -4244,7 +4335,9 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
 	/* Report test parameters. */
 	printf("transaction type: %s\n",
 		   num_scripts == 1 ? sql_script[0].desc : "multiple scripts");
-	printf("scaling factor: %d\n", scale);
+	/* scale to MiB evaluation must be consistent with parse_scale */
+	printf("scaling factor: %d (%.1f MiB of actual data)\n",
+		   scale, (naccounts + ntellers + nbranches) * 100.0 * scale / (1024 * 1024));
 	printf("query mode: %s\n", QUERYMODE[querymode]);
 	printf("number of clients: %d\n", nclients);
 	printf("number of threads: %d\n", nthreads);
@@ -4560,12 +4653,7 @@ main(int argc, char **argv)
 				break;
 			case 's':
 				scale_given = true;
-				scale = atoi(optarg);
-				if (scale <= 0)
-				{
-					fprintf(stderr, "invalid scaling factor: \"%s\"\n", optarg);
-					exit(1);
-				}
+				scale = parse_scale(optarg);
 				break;
 			case 't':
 				benchmarking_option_set = true;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 99286f6..d64f55d 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -81,7 +81,7 @@ pgbench(
 
 # Again, with all possible options
 pgbench(
-'--initialize --init-steps=dtpvg --scale=1 --unlogged-tables --fillfactor=98 --foreign-keys --quiet --tablespace=pg_default --index-tablespace=pg_default',
+'--initialize --init-steps=dtpvg --scale=18M --unlogged-tables --fillfactor=98 --foreign-keys --quiet --tablespace=pg_default --index-tablespace=pg_default',
 	0,
 	[qr{^$}i],
 	[   qr{dropping old tables},
@@ -89,6 +89,7 @@ pgbench(
 		qr{vacuuming},
 		qr{creating primary keys},
 		qr{creating foreign keys},
+		qr{200000 of 200000 tuples}, # scale 2
 		qr{done\.} ],
 	'pgbench scale 1 initialization');
 
diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl
index 6ea55f8..52d135c 100644
--- a/src/bin/pgbench/t/002_pgbench_no_server.pl
+++ b/src/bin/pgbench/t/002_pgbench_no_server.pl
@@ -44,6 +44,7 @@ my @options = (
 	[   'bad #threads', '-j eleven', [qr{invalid number of threads: "eleven"}]
 	],
 	[ 'bad scale', '-i -s two', [qr{invalid scaling factor: "two"}] ],
+	[ 'bad scale size', '-i -s 2stuff', [qr{invalid scaling factor: "2stuff"}] ],
 	[   'invalid #transactions',
 		'-t zil',
 		[qr{invalid number of transactions: "zil"}] ],

Reply via email to