Re: pgbench - add pseudo-random permutation function

Fabien COELHO Sun, 14 Mar 2021 03:18:18 -0700

See attached v22.


v23:
 - replaces remaining occurences of "pr_perm" in the documentation
 - removes duplicated includes

--
Fabien.

diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 299d93b241..9f49a6a78d 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -1057,7 +1057,7 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
 
       <row>
        <entry> <literal>default_seed</literal> </entry>
-       <entry>seed used in hash functions by default</entry>
+       <entry>seed used in hash and pseudo-random permutation functions by default</entry>
       </row>
 
       <row>
@@ -1842,6 +1842,22 @@ SELECT 4 AS four \; SELECT 5 AS five \aset
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <function>permute</function> ( <parameter>i</parameter>, <parameter>size</parameter> [, <parameter>seed</parameter> ] )
+        <returnvalue>integer</returnvalue>
+       </para>
+       <para>
+        Permuted value of <parameter>i</parameter>, in <literal>[0,size)</literal>.
+        This is the new position of <parameter>i</parameter> in a pseudo-random rearrangement of
+        <parameter>size</parameter> first integers parameterized by <parameter>seed</parameter>.
+       </para>
+       <para>
+        <literal>permute(0, 4)</literal>
+        <returnvalue>an integer between 0 and 3</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <function>pi</function> ()
@@ -1960,6 +1976,20 @@ SELECT 4 AS four \; SELECT 5 AS five \aset
     shape of the distribution.
    </para>
 
+   <note>
+    <para>
+      When designing a benchmark which selects rows non-uniformly, be aware
+      that using <application>pgbench</application> non-uniform random functions
+      directly leads to unduly correlations: rows with close ids come out with
+      similar probability, skewing performance measures because they also reside
+      in close pages.
+    </para>
+    <para>
+      You must use <function>permute</function> to shuffle the selected ids, or
+      some other additional step with similar effect, to avoid these skewing impacts.
+    </para>
+   </note>
+
    <itemizedlist>
     <listitem>
      <para>
@@ -2054,24 +2084,54 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
     <literal>hash_fnv1a</literal> accept an input value and an optional seed parameter.
     In case the seed isn't provided the value of <literal>:default_seed</literal>
     is used, which is initialized randomly unless set by the command-line
-    <literal>-D</literal> option. Hash functions can be used to scatter the
-    distribution of random functions such as <literal>random_zipfian</literal> or
-    <literal>random_exponential</literal>. For instance, the following pgbench
-    script simulates possible real world workload typical for social media and
+    <literal>-D</literal> option.
+  </para>
+
+  <para>
+    Function <literal>permute</literal> implements a parametric pseudo-random permutation
+    on an arbitrary size.
+    It allows to shuffle output of non uniform random functions so that 
+    values drawn more often are not trivially correlated.
+    It permutes integers in [0, size) using a seed by applying rounds of
+    simple invertible functions, similarly to an encryption function,
+    although beware that it is not at all cryptographically secure.
+    Compared to <literal>hash</literal> functions discussed above, the function
+    ensures that a perfect permutation is applied: there are neither collisions
+    nor holes in the output values.
+    Values outside the interval are interpreted modulo the size.
+    The function raises an error if size is not positive.
+    If no seed is provided, <literal>:default_seed</literal> is used.
+
+    For instance, the following <application>pgbench</application> script
+    simulates possible real world workload typical for social media and
     blogging platforms where few accounts generate excessive load:
 
 <programlisting>
-\set r random_zipfian(0, 100000000, 1.07)
-\set k abs(hash(:r)) % 1000000
+\set size 1000000
+\set r random_zipfian(1, :size, 1.07)
+\set k 1 + permute(:r, :size)
 </programlisting>
 
     In some cases several distinct distributions are needed which don't correlate
     with each other and this is when implicit seed parameter comes in handy:
 
 <programlisting>
-\set k1 abs(hash(:r, :default_seed + 123)) % 1000000
-\set k2 abs(hash(:r, :default_seed + 321)) % 1000000
+\set k1 1 + permute(:r, :size, :default_seed + 123)
+\set k2 1 + permute(:r, :size, :default_seed + 321)
 </programlisting>
+
+    An similar behavior can also be approximated with
+    <function>hash</function>:
+
+<programlisting>
+\set size 1000000
+\set r random_zipfian(1, 100 * :size, 1.07)
+\set k 1 + abs(hash(:r)) % :size
+</programlisting>
+
+    As this hash-modulo version generates collisions, some
+    <literal>id</literal> would not be reachable and others would come more often
+    than the target distribution.
   </para>
 
   <para>
diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y
index 4d529ea550..73514a0a47 100644
--- a/src/bin/pgbench/exprparse.y
+++ b/src/bin/pgbench/exprparse.y
@@ -19,6 +19,7 @@
 #define PGBENCH_NARGS_VARIABLE	(-1)
 #define PGBENCH_NARGS_CASE		(-2)
 #define PGBENCH_NARGS_HASH		(-3)
+#define PGBENCH_NARGS_PERMUTE	(-4)
 
 PgBenchExpr *expr_parse_result;
 
@@ -370,6 +371,9 @@ static const struct
 	{
 		"hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
 	},
+	{
+		"permute", PGBENCH_NARGS_PERMUTE, PGBENCH_PERMUTE
+	},
 	/* keep as last array element */
 	{
 		NULL, 0, 0
@@ -482,6 +486,19 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args)
 			}
 			break;
 
+		/* pseudo-random permutation function with optional seed argument */
+		case PGBENCH_NARGS_PERMUTE:
+			if (len < 2 || len > 3)
+				expr_yyerror_more(yyscanner, "unexpected number of arguments",
+								  PGBENCH_FUNCTIONS[fnumber].fname);
+
+			if (len == 2)
+			{
+				PgBenchExpr *var = make_variable("default_seed");
+				args = make_elist(var, args);
+			}
+			break;
+
 		/* common case: positive arguments number */
 		default:
 			Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index f6a214669c..2f74b44d4a 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -1124,6 +1124,172 @@ getHashMurmur2(int64 val, uint64 seed)
 	return (int64) result;
 }
 
+/*
+ * Parametric Pseudo-random Permutation
+ */
+
+/* 16 so that % 16 can be optimized to & 0x0f */
+#define PPP_PRIMES 16
+/*
+ * 24 bit mega primes from https://primes.utm.edu/lists/small/millions/
+ * the i-th prime, i \in [0, 16), is the first prime above 2^23 + i * 2^19
+ */
+static uint64 primes[PPP_PRIMES] = {
+	UINT64CONST(8388617),
+	UINT64CONST(8912921),
+	UINT64CONST(9437189),
+	UINT64CONST(9961487),
+	UINT64CONST(10485767),
+	UINT64CONST(11010059),
+	UINT64CONST(11534351),
+	UINT64CONST(12058679),
+	UINT64CONST(12582917),
+	UINT64CONST(13107229),
+	UINT64CONST(13631489),
+	UINT64CONST(14155777),
+	UINT64CONST(14680067),
+	UINT64CONST(15204391),
+	UINT64CONST(15728681),
+	UINT64CONST(16252967)
+};
+
+/* how many "encryption" rounds to apply */
+#define PPP_ROUNDS 4
+
+/* return smallest mask holding n  */
+static uint64
+compute_mask(uint64 n)
+{
+	n |= n >> 1;
+	n |= n >> 2;
+	n |= n >> 4;
+	n |= n >> 8;
+	n |= n >> 16;
+	n |= n >> 32;
+	return n;
+}
+
+/*
+ * Compute (x * y) % m, where x and y in [0, 2^64), m in [1, 2^64).
+ */
+static uint64
+modular_multiply(uint64 x, uint64 y, const uint64 m)
+{
+#if defined(PG_INT128_TYPE)
+	return (PG_INT128_TYPE) x * (PG_INT128_TYPE) y % (PG_INT128_TYPE) m;
+#else
+	pg_log_fatal("modular_multiply not implemented on this platform");
+	pg_log_info("128 bits integer arithmetic not available");
+	exit(1);
+#endif
+}
+
+/*
+ * Donald Knuth's linear congruential generator
+ *
+ * Relying on multiplication overflows is part of the design
+ * of this simple pseudo random number generator.
+ */
+#define DK_LCG_MUL UINT64CONST(6364136223846793005)
+#define DK_LCG_INC UINT64CONST(1442695040888963407)
+
+/* do not use all small bits */
+#define LCG_SHIFT 13
+
+/*
+ * parametric pseudo-random permutation function on int64
+ *
+ * Result in [0, size) is a permutation for inputs in the same set.
+ *
+ * Note that this function does not pass statistical tests: eg
+ * permutations of 2, 3, 4 or 5 ints are not strictly equiprobable.
+ * Things worsen for large sizes as there are many more permutations
+ * (size!) than seeds to select them (2^64 < 21!).
+ * However it is inexpensive compared to an actual encryption function,
+ * and the quality is good enough to avoid trivial correlations on
+ * large sizes, which is the expected use case.
+ *
+ * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE.
+ * DO NOT USE FOR SUCH PURPOSE.
+ */
+static int64
+permute(const int64 data, const int64 isize, const int64 seed)
+{
+	/* computations are performed on unsigned values */
+	uint64		size = (uint64) isize;
+	uint64		v = (uint64) data % size;
+	uint64		key = (uint64) seed;
+	/* size-1 ensures 2 possibly overlapping halves */
+	uint64		mask = compute_mask(size - 1) >> 1;
+
+	/* nothing to permute */
+	if (isize == 1)
+		return 0;
+
+	Assert(isize >= 2);
+
+	/*-----
+	 * Apply 4 rounds of bijective transformations using key updated
+	 * at each stage:
+	 *
+	 * (1) whiten: partial xors on overlapping power-of-2 subsets
+	 *     for instance with v in 0 .. 14 (i.e. with size == 15):
+	 *     if v is in 0 .. 7 do v = (v ^ k) % 8
+	 *     if v is in 7 .. 14 do v = 14 - ((14-v) ^ k) % 8
+	 *     note that because of the overlap (here 7), v may be changed twice.
+	 *     this transformation if bijective because the condition to apply it
+	 *     is still true after applying it, and xor itself is bijective on a
+	 *     power-of-2 size.
+	 *
+	 * (2) scatter: linear modulo
+	 *     v = (v * p + k) % size
+	 *     this transformation is bijective is p & size are prime, which is
+	 *     ensured in the code by the while loop which discards primes when
+	 *     size is a multiple of it.
+	 *
+	 */
+	for (unsigned int i = 0, p = key % PPP_PRIMES; i < PPP_ROUNDS; i++, p = (p + 1) % PPP_PRIMES)
+	{
+		uint64		t;
+
+		/* first "half" whitening, for v in 0 .. mask */
+		key = key * DK_LCG_MUL + DK_LCG_INC;
+		if (v <= mask)
+			v ^= (key >> LCG_SHIFT) & mask;
+
+		/* second (possibly overlapping) "half" whitening */
+		key = key * DK_LCG_MUL + DK_LCG_INC;
+		t = size - 1 - v;
+		if (t <= mask)
+		{
+			t ^= (key >> LCG_SHIFT) & mask;
+			v = size - 1 - t;
+		}
+
+		/* at most 2 primes are skipped for a given size */
+		while (unlikely(size % primes[p] == 0))
+			p = (p + 1) % PPP_PRIMES;
+
+		/* scatter values with a prime multiplication */
+		key = key * DK_LCG_MUL + DK_LCG_INC;
+
+		/* Performance shortcut for 24 bit primes, ok for size up to ~10E12 */
+		if ((v & UINT64CONST(0xffffffffff)) == v)
+			v = (primes[p] * v + (key >> LCG_SHIFT)) % size;
+		else
+			/*-----
+			 * Note: the add cannot overflow as size is under 63 bits:
+			 * mmv = mm(prime, v, size) < size <= 0x7fffffffffffffff = (1<<63)-1
+			 * ks = key >> LCG_SHIFTS <= 2^51
+			 * => mmv + ks < (1<<64) - 1
+			 */
+			v = (modular_multiply(primes[p], v, size) + (key >> LCG_SHIFT)) % size;
+	}
+
+	/* back to signed */
+	return (int64) v;
+}
+
 /*
  * Initialize the given SimpleStats struct to all zeroes
  */
@@ -2472,6 +2638,27 @@ evalStandardFunc(CState *st,
 				return true;
 			}
 
+		case PGBENCH_PERMUTE:
+			{
+				int64	val, size, seed;
+
+				Assert(nargs == 3);
+
+				if (!coerceToInt(&vargs[0], &val) ||
+					!coerceToInt(&vargs[1], &size) ||
+					!coerceToInt(&vargs[2], &seed))
+					return false;
+
+				if (size < 1)
+				{
+					fprintf(stderr, "permute size parameter must be >= 1\n");
+					return false;
+				}
+
+				setIntValue(retval, permute(val, size, seed));
+				return true;
+			}
+
 		default:
 			/* cannot get here */
 			Assert(0);
diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h
index 3a9d89e6f1..6ce1c98649 100644
--- a/src/bin/pgbench/pgbench.h
+++ b/src/bin/pgbench/pgbench.h
@@ -99,7 +99,8 @@ typedef enum PgBenchFunction
 	PGBENCH_IS,
 	PGBENCH_CASE,
 	PGBENCH_HASH_FNV1A,
-	PGBENCH_HASH_MURMUR2
+	PGBENCH_HASH_MURMUR2,
+	PGBENCH_PERMUTE
 } PgBenchFunction;
 
 typedef struct PgBenchExpr PgBenchExpr;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index daffc18e52..2dd1860e71 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -4,6 +4,7 @@ use warnings;
 use PostgresNode;
 use TestLib;
 use Test::More;
+use Config;
 
 # start a pgbench specific server
 my $node = get_new_node('main');
@@ -467,6 +468,15 @@ pgbench(
 		qr{command=98.: int 5432\b},                    # :random_seed
 		qr{command=99.: int -9223372036854775808\b},    # min int
 		qr{command=100.: int 9223372036854775807\b},    # max int
+		# PRP tests
+		qr{command=101.: boolean true\b},
+		qr{command=102.: boolean true\b},
+		qr{command=103.: boolean true\b},
+		qr{command=104.: boolean true\b},
+		qr{command=105.: boolean true\b},
+		qr{command=109.: boolean true\b},
+		qr{command=110.: boolean true\b},
+		qr{command=111.: boolean true\b},
 	],
 	'pgbench expressions',
 	{
@@ -594,9 +604,58 @@ SELECT :v0, :v1, :v2, :v3;
 -- minint constant parsing
 \set min debug(-9223372036854775808)
 \set max debug(-(:min + 1))
+-- parametric pseudo-random permutation
+\set t debug(permute(0, 2) + permute(1, 2) = 1)
+\set t debug(permute(0, 3) + permute(1, 3) + permute(2, 3) = 3)
+\set t debug(permute(0, 4) + permute(1, 4) + permute(2, 4) + permute(3, 4) = 6)
+\set t debug(permute(0, 5) + permute(1, 5) + permute(2, 5) + permute(3, 5) + permute(4, 5) = 10)
+\set t debug(permute(0, 16) + permute(1, 16) + permute(2, 16) + permute(3, 16) + \
+             permute(4, 16) + permute(5, 16) + permute(6, 16) + permute(7, 16) + \
+             permute(8, 16) + permute(9, 16) + permute(10, 16) + permute(11, 16) + \
+             permute(12, 16) + permute(13, 16) + permute(14, 16) + permute(15, 16) = 120)
+-- random sanity check
+\set size random(2, 1000)
+\set v random(0, :size - 1)
+\set p permute(:v, :size)
+\set t debug(0 <= :p and :p < :size and :p = permute(:v + :size, :size) and :p <> permute(:v + 1, :size))
+-- actual values
+\set t debug(permute(:v, 1) = 0)
+\set t debug(permute(0, 2, 5432) = 0 and permute(1, 2, 5432) = 1 and \
+             permute(0, 2, 5431) = 1 and permute(1, 2, 5431) = 0)
 }
 	});
 
+# some PPP tests require 128-bit int support.
+pgbench(
+	'-t 1',
+	0,
+	[ qr{type: .*/001_pgbench_permute}, qr{processed: 1/1} ],
+	[
+		qr{command=3.: boolean true\b},
+		qr{command=4.: int 9223372036854775797\b},
+		qr{command=5.: boolean true\b},
+	],
+	'pgbench permute',
+	{
+		'001_pgbench_permute' => q{
+\set min debug(-9223372036854775808)
+\set max debug(-(:min + 1))
+-- ~50 bits test to exercise full modular multiplication
+\set t debug(permute(0, 1000000000000000, 54321) = 9406454989259 and \
+             permute(1999999999999999, 1000000000000000, 54321) = 54570773902028 and \
+             permute(2500000000000000, 1000000000000000, 54321) = 771082311307468)
+-- 63 bits tests
+\set size debug(:max - 10)
+\set ok debug(permute(:size-1, :size, 5432) = 7214172101785397543 and \
+              permute(:size-2, :size, 5432) = 4060085360303920649 and \
+              permute(:size-3, :size, 5432) =  919477102797071521 and \
+              permute(:size-4, :size, 5432) = 7588404289602340497 and \
+              permute(:size-5, :size, 5432) = 5568354808723584469 and \
+              permute(:size-6, :size, 5432) = 2410458883211907565 and \
+              permute(:size-7, :size, 5432) = 1738667467693569814)
+}
+	}) unless $Config{'osname'} eq 'MSWin32';
+
 # random determinism when seeded
 $node->safe_psql('postgres',
 	'CREATE UNLOGGED TABLE seeded_random(seed INT8 NOT NULL, rand TEXT NOT NULL, val INTEGER NOT NULL);'
@@ -955,6 +1014,10 @@ SELECT LEAST(} . join(', ', (':i') x 256) . q{)}
 		'bad boolean',                     2,
 		[qr{malformed variable.*trueXXX}], q{\set b :badtrue or true}
 	],
+	[
+		'invalid permute size',				2,
+		[qr{permute size parameter must be >= 1}], q{\set i permute(0, 0)}
+	],
 
 	# GSET
 	[
diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl
index e38c7d77d1..4027e68dfa 100644
--- a/src/bin/pgbench/t/002_pgbench_no_server.pl
+++ b/src/bin/pgbench/t/002_pgbench_no_server.pl
@@ -341,6 +341,16 @@ my @script_tests = (
 		'set i',
 		[ qr{set i 1 }, qr{\^ error found here} ],
 		{ 'set_i_op' => "\\set i 1 +\n" }
+	],
+	[
+		'not enough arguments to permute',
+		[qr{unexpected number of arguments \(permute\)}],
+		{ 'bad-permute-1.sql' => "\\set i permute(1)\n" }
+	],
+	[
+		'too many arguments to permute',
+		[qr{unexpected number of arguments \(permute\)}],
+		{ 'bad-permute-2.sql' => "\\set i permute(1, 2, 3, 4)\n" }
 	],);
 
 for my $t (@script_tests)

Re: pgbench - add pseudo-random permutation function

Reply via email to