Re: [HACKERS] Re: [COMMITTERS] pgsql: Remove pgbench "progress" test pending solution of its timing is (fwd)

Fabien COELHO Sat, 24 Nov 2018 08:58:58 -0800

Unfortunately, there was no activity over the last few commitfests and the
proposed patch pgbench-tap-progress-6 can't be applied anymore without
conflicts. Fabien, what are your plans about it, could you please post a
rebased version?


Here it is.

--
Fabien.

diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index c64e16187a..4495395990 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -5736,6 +5736,96 @@ main(int argc, char **argv)
 	return exit_code;
 }
 
+/* display the progress report */
+static void
+doProgressReport(TState *thread, StatsData *plast, int64 *plast_report,
+				 int64 now, int64 thread_start)
+{
+	StatsData	cur;
+	int64		run = now - *plast_report,
+				ntx;
+	double		tps,
+				total_run,
+				latency,
+				sqlat,
+				lag,
+				stdev;
+	char		tbuf[64];
+	int		i;
+
+	/*
+	 * Add up the statistics of all threads.
+	 *
+	 * XXX: No locking. There is no guarantee that we get an
+	 * atomic snapshot of the transaction count and latencies, so
+	 * these figures can well be off by a small amount. The
+	 * progress report's purpose is to give a quick overview of
+	 * how the test is going, so that shouldn't matter too much.
+	 * (If a read from a 64-bit integer is not atomic, you might
+	 * get a "torn" read and completely bogus latencies though!)
+	 */
+	initStats(&cur, 0);
+	for (i = 0; i < nthreads; i++)
+	{
+		mergeSimpleStats(&cur.latency, &thread[i].stats.latency);
+		mergeSimpleStats(&cur.lag, &thread[i].stats.lag);
+		cur.cnt += thread[i].stats.cnt;
+		cur.skipped += thread[i].stats.skipped;
+	}
+
+	/* we count only actually executed transactions */
+	ntx = (cur.cnt - cur.skipped) - (plast->cnt - plast->skipped);
+	total_run = (now - thread_start) / 1000000.0;
+	tps = 1000000.0 * ntx / run;
+	if (ntx > 0)
+	{
+		latency = 0.001 * (cur.latency.sum - plast->latency.sum) / ntx;
+		sqlat = 1.0 * (cur.latency.sum2 - plast->latency.sum2) / ntx;
+		stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
+		lag = 0.001 * (cur.lag.sum - plast->lag.sum) / ntx;
+	}
+	else
+	{
+		latency = sqlat = stdev = lag = 0;
+	}
+
+	if (progress_timestamp)
+	{
+		/*
+		 * On some platforms the current system timestamp is
+		 * available in now_time, but rather than get entangled
+		 * with that, we just eat the cost of an extra syscall in
+		 * all cases.
+		 */
+		struct timeval tv;
+
+		gettimeofday(&tv, NULL);
+		snprintf(tbuf, sizeof(tbuf), "%ld.%03ld s",
+				 (long) tv.tv_sec, (long) (tv.tv_usec / 1000));
+	}
+	else
+	{
+		/* round seconds are expected, but the thread may be late */
+		snprintf(tbuf, sizeof(tbuf), "%.1f s", total_run);
+	}
+
+	fprintf(stderr,
+			"progress: %s, %.1f tps, lat %.3f ms stddev %.3f",
+			tbuf, tps, latency, stdev);
+
+	if (throttle_delay)
+	{
+		fprintf(stderr, ", lag %.3f ms", lag);
+		if (latency_limit)
+			fprintf(stderr, ", " INT64_FORMAT " skipped",
+					cur.skipped - plast->skipped);
+	}
+	fprintf(stderr, "\n");
+
+	*plast = cur;
+	*plast_report = now;
+}
+
 static void *
 threadRun(void *arg)
 {
@@ -5746,6 +5836,7 @@ threadRun(void *arg)
 	int			nstate = thread->nstate;
 	int			remains = nstate;	/* number of remaining clients */
 	socket_set *sockets = alloc_socket_set(nstate);
+	int			aborted = 0;
 	int			i;
 
 	/* for reporting progress: */
@@ -5975,6 +6066,10 @@ threadRun(void *arg)
 			 */
 			if (st->state == CSTATE_FINISHED || st->state == CSTATE_ABORTED)
 				remains--;
+
+			/* count aborted clients */
+			if (st->state == CSTATE_ABORTED)
+				aborted++;
 		}
 
 		/* progress report is made by thread 0 for all threads */
@@ -5987,93 +6082,15 @@ threadRun(void *arg)
 			now = INSTR_TIME_GET_MICROSEC(now_time);
 			if (now >= next_report)
 			{
-				/* generate and show report */
-				StatsData	cur;
-				int64		run = now - last_report,
-							ntx;
-				double		tps,
-							total_run,
-							latency,
-							sqlat,
-							lag,
-							stdev;
-				char		tbuf[315];
-
-				/*
-				 * Add up the statistics of all threads.
-				 *
-				 * XXX: No locking. There is no guarantee that we get an
-				 * atomic snapshot of the transaction count and latencies, so
-				 * these figures can well be off by a small amount. The
-				 * progress report's purpose is to give a quick overview of
-				 * how the test is going, so that shouldn't matter too much.
-				 * (If a read from a 64-bit integer is not atomic, you might
-				 * get a "torn" read and completely bogus latencies though!)
-				 */
-				initStats(&cur, 0);
-				for (i = 0; i < nthreads; i++)
-				{
-					mergeSimpleStats(&cur.latency, &thread[i].stats.latency);
-					mergeSimpleStats(&cur.lag, &thread[i].stats.lag);
-					cur.cnt += thread[i].stats.cnt;
-					cur.skipped += thread[i].stats.skipped;
-				}
-
-				/* we count only actually executed transactions */
-				ntx = (cur.cnt - cur.skipped) - (last.cnt - last.skipped);
-				total_run = (now - thread_start) / 1000000.0;
-				tps = 1000000.0 * ntx / run;
-				if (ntx > 0)
-				{
-					latency = 0.001 * (cur.latency.sum - last.latency.sum) / ntx;
-					sqlat = 1.0 * (cur.latency.sum2 - last.latency.sum2) / ntx;
-					stdev = 0.001 * sqrt(sqlat - 1000000.0 * latency * latency);
-					lag = 0.001 * (cur.lag.sum - last.lag.sum) / ntx;
-				}
-				else
-				{
-					latency = sqlat = stdev = lag = 0;
-				}
-
-				if (progress_timestamp)
-				{
-					/*
-					 * On some platforms the current system timestamp is
-					 * available in now_time, but rather than get entangled
-					 * with that, we just eat the cost of an extra syscall in
-					 * all cases.
-					 */
-					struct timeval tv;
-
-					gettimeofday(&tv, NULL);
-					snprintf(tbuf, sizeof(tbuf), "%ld.%03ld s",
-							 (long) tv.tv_sec, (long) (tv.tv_usec / 1000));
-				}
-				else
-				{
-					/* round seconds are expected, but the thread may be late */
-					snprintf(tbuf, sizeof(tbuf), "%.1f s", total_run);
-				}
-
-				fprintf(stderr,
-						"progress: %s, %.1f tps, lat %.3f ms stddev %.3f",
-						tbuf, tps, latency, stdev);
-
-				if (throttle_delay)
-				{
-					fprintf(stderr, ", lag %.3f ms", lag);
-					if (latency_limit)
-						fprintf(stderr, ", " INT64_FORMAT " skipped",
-								cur.skipped - last.skipped);
-				}
-				fprintf(stderr, "\n");
-
-				last = cur;
-				last_report = now;
+				doProgressReport(thread, &last,
+								 &last_report, now, thread_start);
 
 				/*
 				 * Ensure that the next report is in the future, in case
-				 * pgbench/postgres got stuck somewhere.
+				 * pgbench/postgres got stuck somewhere. This may skip
+				 * some progress reports if the thread does not get enough
+				 * cpu time. In such case, probably the whole bench should
+				 * be ignored.
 				 */
 				do
 				{
@@ -6084,6 +6101,51 @@ threadRun(void *arg)
 	}
 
 done:
+	/*
+	 * Under -R, comply with -T and -P even if there is nothing to do,
+	 * (unless all clients aborted) and ensure that one report is printed.
+	 * This special behavior allows tap tests to check that a run lasts
+	 * as expected and that some progress is shown, even on very slow hosts.
+	 */
+	if (duration && throttle_delay && aborted < nstate && thread->tid == 0)
+	{
+		int64		thread_end = thread_start + (int64) duration * 1000000;
+		instr_time	now_time;
+		int64		now;
+
+		INSTR_TIME_SET_CURRENT(now_time);
+		now = INSTR_TIME_GET_MICROSEC(now_time);
+
+		while (now < thread_end)
+		{
+			if (progress && next_report <= thread_end)
+			{
+				pg_usleep(next_report - now);
+				INSTR_TIME_SET_CURRENT(now_time);
+				now = INSTR_TIME_GET_MICROSEC(now_time);
+				doProgressReport(thread, &last, &last_report, now, thread_start);
+				/* schedule next report */
+				do
+				{
+					next_report += (int64) progress * 1000000;
+				} while (now >= next_report);
+			}
+			else
+			{
+				pg_usleep(thread_end - now);
+				INSTR_TIME_SET_CURRENT(now_time);
+				now = INSTR_TIME_GET_MICROSEC(now_time);
+			}
+		}
+
+		/*
+		 * Print a closing progress report if none were printed
+		 * and at least one was expected.
+		 */
+		if (progress && progress <= duration && last_report == thread_start)
+			doProgressReport(thread, &last, &last_report, now, thread_start);
+	}
+
 	INSTR_TIME_SET_CURRENT(start);
 	disconnect_all(state, nstate);
 	INSTR_TIME_SET_CURRENT(end);
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 89678e7f3f..f02bd66855 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -4,13 +4,14 @@ use warnings;
 use PostgresNode;
 use TestLib;
 use Test::More;
+use Time::HiRes qw{time};
 
 # start a pgbench specific server
 my $node = get_new_node('main');
 $node->init;
 $node->start;
 
-# invoke pgbench
+# invoke pgbench, return elapsed time
 sub pgbench
 {
 	local $Test::Builder::Level = $Test::Builder::Level + 1;
@@ -40,12 +41,14 @@ sub pgbench
 			append_to_file($filename, $$files{$fn});
 		}
 	}
+
+	my $t0 = time();
 	$node->command_checks_all(\@cmd, $stat, $out, $err, $name);
 
 	# cleanup?
 	#unlink @filenames or die "cannot unlink files (@filenames): $!";
 
-	return;
+	return time() - $t0;
 }
 
 # Test concurrent insertion into table with serial column.  This
@@ -837,7 +840,44 @@ sub check_pgbench_logs
 
 my $bdir = $node->basedir;
 
-# with sampling rate
+# The point of this test is coverage of various time-related features
+# (-T, -P, --aggregate-interval, --rate, --latency-limit...), so it is
+# somehow time sensitive.
+# The checks performed are quite loose so as to still pass even under
+# degraded (high load, slow host, valgrind run) testing conditions.
+# The main point is to provide coverage, most of the time.
+my $elapsed = pgbench(
+	"-T 2 -P 1 -l --log-prefix=$bdir/001_pgbench_log_1 --aggregate-interval=1"
+	  . ' -S -b se@2 --rate=20 --latency-limit=1000 -j ' . $nthreads
+	  . ' -c 3 -r',
+	0,
+	[   qr{type: multiple},
+		qr{clients: 3},
+		qr{threads: $nthreads},
+		# the shown duration is really -T argument value
+		qr{duration: 2 s},
+		qr{script 1: .* select only},
+		qr{script 2: .* select only},
+		qr{statement latencies in milliseconds},
+		qr{FROM pgbench_accounts} ],
+	[	qr{vacuum},
+		qr{progress: \d\b} ],
+	'pgbench progress');
+
+# this depends on pgbench to avoid a shortcut under --rate
+# if there is nothing to do.
+ok($elapsed >= 2.0, "-T 2 leads to at least 2 second run");
+
+# if the test has gone AWOL, coldly skip these detailed checks.
+if (abs($elapsed - 2.0) < 0.5)
+{
+	# $nthreads threads, 2 seconds, but due to timing imprecision we might get
+	# only 1 or as many as 3 progress reports per thread.
+	check_pgbench_logs("$bdir/001_pgbench_log_1", $nthreads, 1, 3,
+		qr{^\d+ \d{1,2} \d+ \d+ \d+ \d+ \d+ \d+ \d+ \d+ \d+$});
+}
+
+# with sampling rate, not time sensitive
 pgbench(
 	"-n -S -t 50 -c 2 --log --log-prefix=$bdir/001_pgbench_log_2 --sampling-rate=0.5",
 	0,
@@ -845,6 +885,8 @@ pgbench(
 	[qr{^$}],
 	'pgbench logs');
 
+# random 50% of 2*50 transactions, accept between 8 and 92
+# probability of failure is about 2 * 2^-42 (?)
 check_pgbench_logs("$bdir/001_pgbench_log_2", 1, 8, 92,
 	qr{^0 \d{1,2} \d+ \d \d+ \d+$});

Re: [HACKERS] Re: [COMMITTERS] pgsql: Remove pgbench "progress" test pending solution of its timing is (fwd)

Reply via email to