Re: pgbench - add pseudo-random permutation function

Fabien COELHO Thu, 01 Apr 2021 22:44:12 -0700

See attached v27 proposal.


As usual, it is easier to see with the actual attachement:-)

--
Fabien.

diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 50cf22ba6b..84d9566f49 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -1057,7 +1057,7 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
 
       <row>
        <entry> <literal>default_seed</literal> </entry>
-       <entry>seed used in hash functions by default</entry>
+       <entry>seed used in hash and pseudorandom permutation functions by default</entry>
       </row>
 
       <row>
@@ -1864,6 +1864,24 @@ SELECT 4 AS four \; SELECT 5 AS five \aset
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <function>permute</function> ( <parameter>i</parameter>, <parameter>size</parameter> [, <parameter>seed</parameter> ] )
+        <returnvalue>integer</returnvalue>
+       </para>
+       <para>
+        Permuted value of <parameter>i</parameter>, in the range
+        <literal>[0, size)</literal>.  This is the new position of
+        <parameter>i</parameter> (modulo <parameter>size</parameter>) in a
+        pseudorandom permutation of the integers <literal>0...size-1</literal>,
+        parameterized by <parameter>seed</parameter>.
+       </para>
+       <para>
+        <literal>permute(0, 4)</literal>
+        <returnvalue>an integer between 0 and 3</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <function>pi</function> ()
@@ -2071,29 +2089,70 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
     </listitem>
    </itemizedlist>
 
+   <note>
+    <para>
+      When designing a benchmark which selects rows non-uniformly, be aware
+      that the rows chosen may be correlated with other data such as IDs from
+      a sequence or the physical row ordering, which may skew performance
+      measurements.
+    </para>
+    <para>
+      To avoid this, you may wish to use the <function>permute</function>
+      function, or some other additional step with similar effect, to shuffle
+      the selected rows and remove such correlations.
+    </para>
+   </note>
+
   <para>
     Hash functions <literal>hash</literal>, <literal>hash_murmur2</literal> and
     <literal>hash_fnv1a</literal> accept an input value and an optional seed parameter.
     In case the seed isn't provided the value of <literal>:default_seed</literal>
     is used, which is initialized randomly unless set by the command-line
-    <literal>-D</literal> option. Hash functions can be used to scatter the
-    distribution of random functions such as <literal>random_zipfian</literal> or
-    <literal>random_exponential</literal>. For instance, the following pgbench
-    script simulates possible real world workload typical for social media and
-    blogging platforms where few accounts generate excessive load:
+    <literal>-D</literal> option.
+  </para>
+
+  <para>
+    <literal>permute</literal> accepts an input value, a size, and an optional
+    seed parameter.  It generates a pseudorandom permutation of integers in
+    the range <literal>[0, size)</literal>, and returns the index of the input
+    value in the permuted values.  The permutation chosen is parameterized by
+    the seed, which defaults to <literal>:default_seed</literal>, if not
+    specified.  Unlike the hash functions, <literal>permute</literal> ensures
+    that there are no collisions or holes in the output values.  Input values
+    outside the interval are interpreted modulo the size.  The function raises
+    an error if the size is not positive.  <function>permute</function> can be
+    used to scatter the distribution of non-uniform random functions such as
+    <literal>random_zipfian</literal> or <literal>random_exponential</literal>
+    so that values drawn more often are not trivially correlated.  For
+    instance, the following <application>pgbench</application> script
+    simulates a possible real world workload typical for social media and
+    blogging platforms where a few accounts generate excessive load:
 
 <programlisting>
-\set r random_zipfian(0, 100000000, 1.07)
-\set k abs(hash(:r)) % 1000000
+\set size 1000000
+\set r random_zipfian(1, :size, 1.07)
+\set k 1 + permute(:r, :size)
 </programlisting>
 
     In some cases several distinct distributions are needed which don't correlate
-    with each other and this is when implicit seed parameter comes in handy:
+    with each other and this is when the optional seed parameter comes in handy:
 
 <programlisting>
-\set k1 abs(hash(:r, :default_seed + 123)) % 1000000
-\set k2 abs(hash(:r, :default_seed + 321)) % 1000000
+\set k1 1 + permute(:r, :size, :default_seed + 123)
+\set k2 1 + permute(:r, :size, :default_seed + 321)
 </programlisting>
+
+    A similar behavior can also be approximated with <function>hash</function>:
+
+<programlisting>
+\set size 1000000
+\set r random_zipfian(1, 100 * :size, 1.07)
+\set k 1 + abs(hash(:r)) % :size
+</programlisting>
+
+    However, since <function>hash</function> generates collisions, some values
+    will not be reachable and others will be more frequent than expected from
+    the original distribution.
   </para>
 
   <para>
diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y
index 4d529ea550..56f75ccd25 100644
--- a/src/bin/pgbench/exprparse.y
+++ b/src/bin/pgbench/exprparse.y
@@ -19,6 +19,7 @@
 #define PGBENCH_NARGS_VARIABLE	(-1)
 #define PGBENCH_NARGS_CASE		(-2)
 #define PGBENCH_NARGS_HASH		(-3)
+#define PGBENCH_NARGS_PERMUTE	(-4)
 
 PgBenchExpr *expr_parse_result;
 
@@ -370,6 +371,9 @@ static const struct
 	{
 		"hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
 	},
+	{
+		"permute", PGBENCH_NARGS_PERMUTE, PGBENCH_PERMUTE
+	},
 	/* keep as last array element */
 	{
 		NULL, 0, 0
@@ -482,6 +486,19 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args)
 			}
 			break;
 
+		/* pseudorandom permutation function with optional seed argument */
+		case PGBENCH_NARGS_PERMUTE:
+			if (len < 2 || len > 3)
+				expr_yyerror_more(yyscanner, "unexpected number of arguments",
+								  PGBENCH_FUNCTIONS[fnumber].fname);
+
+			if (len == 2)
+			{
+				PgBenchExpr *var = make_variable("default_seed");
+				args = make_elist(var, args);
+			}
+			break;
+
 		/* common case: positive arguments number */
 		default:
 			Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 48ce1712cc..63a4b24e54 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -66,6 +66,7 @@
 #include "getopt_long.h"
 #include "libpq-fe.h"
 #include "pgbench.h"
+#include "port/pg_bitutils.h"
 #include "portability/instr_time.h"
 
 #ifndef M_PI
@@ -1127,6 +1128,120 @@ getHashMurmur2(int64 val, uint64 seed)
 	return (int64) result;
 }
 
+static uint64
+randu64(RandomState * state)
+{
+	uint64 r1 = pg_jrand48((*state).xseed),
+		   r2 = pg_jrand48((*state).xseed);
+	return r1 << 51 | r2 << 13 | r1 >> 13;
+}
+
+
+/*
+ * Pseudorandom permutation function
+ *
+ * For small sizes, this generates each of the (size!) possible permutations
+ * of integers in the range [0, size) with roughly equal probability.  Once
+ * the size is larger than 16, the number of possible permutations exceeds the
+ * number of distinct states of the internal pseudorandom number generator,
+ * and so not all possible permutations can be generated, but the permutations
+ * chosen should continue to give the appearance of being random.
+ *
+ * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE.
+ * DO NOT USE FOR SUCH PURPOSE.
+ */
+static int64
+permute(const int64 val, const int64 isize, const int64 seed)
+{
+	RandomState random_state1;
+	RandomState random_state2;
+	uint64		size;
+	uint64		v;
+	int			masklen;
+	uint64		mask;
+	int			i;
+
+	if (isize < 2)
+		return 0;				/* nothing to permute */
+
+	/* Initialize a pair of random states using the seed */
+	random_state1.xseed[0] = seed & 0xFFFF;
+	random_state1.xseed[1] = (seed >> 16) & 0xFFFF;
+	random_state1.xseed[2] = (seed >> 32) & 0xFFFF;
+
+	random_state2.xseed[0] = (((uint64) seed) >> 48) & 0xFFFF;
+	random_state2.xseed[1] = seed & 0xFFFF;
+	random_state2.xseed[2] = (seed >> 16) & 0xFFFF;
+
+	/* Computations are performed on unsigned values */
+	size = (uint64) isize;
+	v = (uint64) val % size;
+
+	/* Mask to work modulo largest power of 2 less than or equal to size */
+	masklen = pg_leftmost_one_pos64(size);
+	mask = (((uint64) 1) << masklen) - 1;
+
+	/*
+	 * Permute the input value by applying 4 rounds of pseudorandom bijective
+	 * transformations.  The intention here is to distribute each input
+	 * uniformly randomly across the range, and separate adjacent inputs
+	 * approximately uniformly randomly from each other, leading to a fairly
+	 * random overall choice of permutation.
+	 *
+	 * To separate adjacent inputs, we multiply by a random number modulo
+	 * (mask + 1), which is a power of 2.  For this to be a bijection, the
+	 * multiplier must be odd.  Since this is known to lead to less randomness
+	 * in the lower bits, we also apply a rotation that shifts the topmost bit
+	 * into the least significant bit.  In the special cases where size <= 3,
+	 * mask = 1 and each of these operations is actually a no-op, so we also
+	 * XOR with a different random number to inject additional randomness.
+	 * Since the size is generally not a power of 2, we apply this bijection
+	 * on overlapping upper and lower halves of the input.
+	 *
+	 * To distribute the inputs uniformly across the range, we then also apply
+	 * a random offset modulo the full range.
+	 *
+	 * Taken together, these operations resemble a modified linear
+	 * congruential generator, as is commonly used in pseudorandom number
+	 * generators.  Empirically, it is found that for small sizes it selects
+	 * each of the (size!) possible permutations with roughly equal
+	 * probability.  For larger sizes, not all permutations can be generated,
+	 * but the intended random spread is still produced.
+	 */
+	for (i = 0; i < 4; i++)
+	{
+		uint64		m,
+					r,
+					t;
+
+		/* Random multiply (by an odd number), XOR and rotate of lower half */
+		m = randu64(&random_state1) | 1;
+		r = randu64(&random_state2);
+		if (v <= mask)
+		{
+			v = ((v * m) ^ r) & mask;
+			v = ((v << 1) & mask) | (v >> (masklen - 1));
+		}
+
+		/* Random multiply (by an odd number), XOR and rotate of upper half */
+		m = randu64(&random_state1) | 1;
+		r = randu64(&random_state2);
+		t = size - 1 - v;
+		if (t <= mask)
+		{
+			t = ((t * m) ^ r) & mask;
+			t = ((t << 1) & mask) | (t >> (masklen - 1));
+			v = size - 1 - t;
+		}
+
+		/* Random offset */
+		r = randu64(&random_state2);
+		v = (v + r) % size;
+	}
+
+	return (int64) v;
+}
+
 /*
  * Initialize the given SimpleStats struct to all zeroes
  */
@@ -2475,6 +2590,29 @@ evalStandardFunc(CState *st,
 				return true;
 			}
 
+		case PGBENCH_PERMUTE:
+			{
+				int64		val,
+							size,
+							seed;
+
+				Assert(nargs == 3);
+
+				if (!coerceToInt(&vargs[0], &val) ||
+					!coerceToInt(&vargs[1], &size) ||
+					!coerceToInt(&vargs[2], &seed))
+					return false;
+
+				if (size <= 0)
+				{
+					pg_log_error("permute size parameter must be greater than zero");
+					return false;
+				}
+
+				setIntValue(retval, permute(val, size, seed));
+				return true;
+			}
+
 		default:
 			/* cannot get here */
 			Assert(0);
diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h
index 3a9d89e6f1..6ce1c98649 100644
--- a/src/bin/pgbench/pgbench.h
+++ b/src/bin/pgbench/pgbench.h
@@ -99,7 +99,8 @@ typedef enum PgBenchFunction
 	PGBENCH_IS,
 	PGBENCH_CASE,
 	PGBENCH_HASH_FNV1A,
-	PGBENCH_HASH_MURMUR2
+	PGBENCH_HASH_MURMUR2,
+	PGBENCH_PERMUTE
 } PgBenchFunction;
 
 typedef struct PgBenchExpr PgBenchExpr;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 82a46c72b6..d2b7f2efcb 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -4,6 +4,7 @@ use warnings;
 use PostgresNode;
 use TestLib;
 use Test::More;
+use Config;
 
 # start a pgbench specific server
 my $node = get_new_node('main');
@@ -483,6 +484,17 @@ pgbench(
 		qr{command=98.: int 5432\b},                    # :random_seed
 		qr{command=99.: int -9223372036854775808\b},    # min int
 		qr{command=100.: int 9223372036854775807\b},    # max int
+		# pseudorandom permutation tests
+		qr{command=101.: boolean true\b},
+		qr{command=102.: boolean true\b},
+		qr{command=103.: boolean true\b},
+		qr{command=104.: boolean true\b},
+		qr{command=105.: boolean true\b},
+		qr{command=109.: boolean true\b},
+		qr{command=110.: boolean true\b},
+		qr{command=111.: boolean true\b},
+		qr{command=112.: int 9223372036854775797\b},
+		qr{command=113.: boolean true\b},
 	],
 	'pgbench expressions',
 	{
@@ -610,6 +622,33 @@ SELECT :v0, :v1, :v2, :v3;
 -- minint constant parsing
 \set min debug(-9223372036854775808)
 \set max debug(-(:min + 1))
+-- parametric pseudorandom permutation function
+\set t debug(permute(0, 2) + permute(1, 2) = 1)
+\set t debug(permute(0, 3) + permute(1, 3) + permute(2, 3) = 3)
+\set t debug(permute(0, 4) + permute(1, 4) + permute(2, 4) + permute(3, 4) = 6)
+\set t debug(permute(0, 5) + permute(1, 5) + permute(2, 5) + permute(3, 5) + permute(4, 5) = 10)
+\set t debug(permute(0, 16) + permute(1, 16) + permute(2, 16) + permute(3, 16) + \
+             permute(4, 16) + permute(5, 16) + permute(6, 16) + permute(7, 16) + \
+             permute(8, 16) + permute(9, 16) + permute(10, 16) + permute(11, 16) + \
+             permute(12, 16) + permute(13, 16) + permute(14, 16) + permute(15, 16) = 120)
+-- random sanity checks
+\set size random(2, 1000)
+\set v random(0, :size - 1)
+\set p permute(:v, :size)
+\set t debug(0 <= :p and :p < :size and :p = permute(:v + :size, :size) and :p <> permute(:v + 1, :size))
+-- actual values
+\set t debug(permute(:v, 1) = 0)
+\set t debug(permute(0, 2, 5432) = 0 and permute(1, 2, 5432) = 1 and \
+             permute(0, 2, 5435) = 1 and permute(1, 2, 5435) = 0)
+-- 63 bits tests
+\set size debug(:max - 10)
+\set t debug(permute(:size-1, :size, 5432) = 6027738441054679063 and \
+             permute(:size-2, :size, 5432) = 3883756639104291588 and \
+             permute(:size-3, :size, 5432) = 3407620656580533607 and \
+             permute(:size-4, :size, 5432) = 4439146987379709618 and \
+             permute(:size-5, :size, 5432) = 2736158554213998904 and \
+             permute(:size-6, :size, 5432) = 1448350938097189462 and \
+             permute(:size-7, :size, 5432) = 3955131470409280934)
 }
 	});
 
@@ -1048,6 +1087,10 @@ SELECT LEAST(} . join(', ', (':i') x 256) . q{)}
 		'bad boolean',                     2,
 		[qr{malformed variable.*trueXXX}], q{\set b :badtrue or true}
 	],
+	[
+		'invalid permute size',				2,
+		[qr{permute size parameter must be greater than zero}], q{\set i permute(0, 0)}
+	],
 
 	# GSET
 	[
diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl
index e38c7d77d1..4027e68dfa 100644
--- a/src/bin/pgbench/t/002_pgbench_no_server.pl
+++ b/src/bin/pgbench/t/002_pgbench_no_server.pl
@@ -341,6 +341,16 @@ my @script_tests = (
 		'set i',
 		[ qr{set i 1 }, qr{\^ error found here} ],
 		{ 'set_i_op' => "\\set i 1 +\n" }
+	],
+	[
+		'not enough arguments to permute',
+		[qr{unexpected number of arguments \(permute\)}],
+		{ 'bad-permute-1.sql' => "\\set i permute(1)\n" }
+	],
+	[
+		'too many arguments to permute',
+		[qr{unexpected number of arguments \(permute\)}],
+		{ 'bad-permute-2.sql' => "\\set i permute(1, 2, 3, 4)\n" }
 	],);
 
 for my $t (@script_tests)

Re: pgbench - add pseudo-random permutation function

Reply via email to