See attached v27 proposal.
As usual, it is easier to see with the actual attachement:-)
--
Fabien.
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 50cf22ba6b..84d9566f49 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -1057,7 +1057,7 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
<row>
<entry> <literal>default_seed</literal> </entry>
- <entry>seed used in hash functions by default</entry>
+ <entry>seed used in hash and pseudorandom permutation functions by default</entry>
</row>
<row>
@@ -1864,6 +1864,24 @@ SELECT 4 AS four \; SELECT 5 AS five \aset
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <function>permute</function> ( <parameter>i</parameter>, <parameter>size</parameter> [, <parameter>seed</parameter> ] )
+ <returnvalue>integer</returnvalue>
+ </para>
+ <para>
+ Permuted value of <parameter>i</parameter>, in the range
+ <literal>[0, size)</literal>. This is the new position of
+ <parameter>i</parameter> (modulo <parameter>size</parameter>) in a
+ pseudorandom permutation of the integers <literal>0...size-1</literal>,
+ parameterized by <parameter>seed</parameter>.
+ </para>
+ <para>
+ <literal>permute(0, 4)</literal>
+ <returnvalue>an integer between 0 and 3</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<function>pi</function> ()
@@ -2071,29 +2089,70 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
</listitem>
</itemizedlist>
+ <note>
+ <para>
+ When designing a benchmark which selects rows non-uniformly, be aware
+ that the rows chosen may be correlated with other data such as IDs from
+ a sequence or the physical row ordering, which may skew performance
+ measurements.
+ </para>
+ <para>
+ To avoid this, you may wish to use the <function>permute</function>
+ function, or some other additional step with similar effect, to shuffle
+ the selected rows and remove such correlations.
+ </para>
+ </note>
+
<para>
Hash functions <literal>hash</literal>, <literal>hash_murmur2</literal> and
<literal>hash_fnv1a</literal> accept an input value and an optional seed parameter.
In case the seed isn't provided the value of <literal>:default_seed</literal>
is used, which is initialized randomly unless set by the command-line
- <literal>-D</literal> option. Hash functions can be used to scatter the
- distribution of random functions such as <literal>random_zipfian</literal> or
- <literal>random_exponential</literal>. For instance, the following pgbench
- script simulates possible real world workload typical for social media and
- blogging platforms where few accounts generate excessive load:
+ <literal>-D</literal> option.
+ </para>
+
+ <para>
+ <literal>permute</literal> accepts an input value, a size, and an optional
+ seed parameter. It generates a pseudorandom permutation of integers in
+ the range <literal>[0, size)</literal>, and returns the index of the input
+ value in the permuted values. The permutation chosen is parameterized by
+ the seed, which defaults to <literal>:default_seed</literal>, if not
+ specified. Unlike the hash functions, <literal>permute</literal> ensures
+ that there are no collisions or holes in the output values. Input values
+ outside the interval are interpreted modulo the size. The function raises
+ an error if the size is not positive. <function>permute</function> can be
+ used to scatter the distribution of non-uniform random functions such as
+ <literal>random_zipfian</literal> or <literal>random_exponential</literal>
+ so that values drawn more often are not trivially correlated. For
+ instance, the following <application>pgbench</application> script
+ simulates a possible real world workload typical for social media and
+ blogging platforms where a few accounts generate excessive load:
<programlisting>
-\set r random_zipfian(0, 100000000, 1.07)
-\set k abs(hash(:r)) % 1000000
+\set size 1000000
+\set r random_zipfian(1, :size, 1.07)
+\set k 1 + permute(:r, :size)
</programlisting>
In some cases several distinct distributions are needed which don't correlate
- with each other and this is when implicit seed parameter comes in handy:
+ with each other and this is when the optional seed parameter comes in handy:
<programlisting>
-\set k1 abs(hash(:r, :default_seed + 123)) % 1000000
-\set k2 abs(hash(:r, :default_seed + 321)) % 1000000
+\set k1 1 + permute(:r, :size, :default_seed + 123)
+\set k2 1 + permute(:r, :size, :default_seed + 321)
</programlisting>
+
+ A similar behavior can also be approximated with <function>hash</function>:
+
+<programlisting>
+\set size 1000000
+\set r random_zipfian(1, 100 * :size, 1.07)
+\set k 1 + abs(hash(:r)) % :size
+</programlisting>
+
+ However, since <function>hash</function> generates collisions, some values
+ will not be reachable and others will be more frequent than expected from
+ the original distribution.
</para>
<para>
diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y
index 4d529ea550..56f75ccd25 100644
--- a/src/bin/pgbench/exprparse.y
+++ b/src/bin/pgbench/exprparse.y
@@ -19,6 +19,7 @@
#define PGBENCH_NARGS_VARIABLE (-1)
#define PGBENCH_NARGS_CASE (-2)
#define PGBENCH_NARGS_HASH (-3)
+#define PGBENCH_NARGS_PERMUTE (-4)
PgBenchExpr *expr_parse_result;
@@ -370,6 +371,9 @@ static const struct
{
"hash_fnv1a", PGBENCH_NARGS_HASH, PGBENCH_HASH_FNV1A
},
+ {
+ "permute", PGBENCH_NARGS_PERMUTE, PGBENCH_PERMUTE
+ },
/* keep as last array element */
{
NULL, 0, 0
@@ -482,6 +486,19 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList *args)
}
break;
+ /* pseudorandom permutation function with optional seed argument */
+ case PGBENCH_NARGS_PERMUTE:
+ if (len < 2 || len > 3)
+ expr_yyerror_more(yyscanner, "unexpected number of arguments",
+ PGBENCH_FUNCTIONS[fnumber].fname);
+
+ if (len == 2)
+ {
+ PgBenchExpr *var = make_variable("default_seed");
+ args = make_elist(var, args);
+ }
+ break;
+
/* common case: positive arguments number */
default:
Assert(PGBENCH_FUNCTIONS[fnumber].nargs >= 0);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 48ce1712cc..63a4b24e54 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -66,6 +66,7 @@
#include "getopt_long.h"
#include "libpq-fe.h"
#include "pgbench.h"
+#include "port/pg_bitutils.h"
#include "portability/instr_time.h"
#ifndef M_PI
@@ -1127,6 +1128,120 @@ getHashMurmur2(int64 val, uint64 seed)
return (int64) result;
}
+static uint64
+randu64(RandomState * state)
+{
+ uint64 r1 = pg_jrand48((*state).xseed),
+ r2 = pg_jrand48((*state).xseed);
+ return r1 << 51 | r2 << 13 | r1 >> 13;
+}
+
+
+/*
+ * Pseudorandom permutation function
+ *
+ * For small sizes, this generates each of the (size!) possible permutations
+ * of integers in the range [0, size) with roughly equal probability. Once
+ * the size is larger than 16, the number of possible permutations exceeds the
+ * number of distinct states of the internal pseudorandom number generator,
+ * and so not all possible permutations can be generated, but the permutations
+ * chosen should continue to give the appearance of being random.
+ *
+ * THIS FUNCTION IS NOT CRYPTOGRAPHICALLY SECURE.
+ * DO NOT USE FOR SUCH PURPOSE.
+ */
+static int64
+permute(const int64 val, const int64 isize, const int64 seed)
+{
+ RandomState random_state1;
+ RandomState random_state2;
+ uint64 size;
+ uint64 v;
+ int masklen;
+ uint64 mask;
+ int i;
+
+ if (isize < 2)
+ return 0; /* nothing to permute */
+
+ /* Initialize a pair of random states using the seed */
+ random_state1.xseed[0] = seed & 0xFFFF;
+ random_state1.xseed[1] = (seed >> 16) & 0xFFFF;
+ random_state1.xseed[2] = (seed >> 32) & 0xFFFF;
+
+ random_state2.xseed[0] = (((uint64) seed) >> 48) & 0xFFFF;
+ random_state2.xseed[1] = seed & 0xFFFF;
+ random_state2.xseed[2] = (seed >> 16) & 0xFFFF;
+
+ /* Computations are performed on unsigned values */
+ size = (uint64) isize;
+ v = (uint64) val % size;
+
+ /* Mask to work modulo largest power of 2 less than or equal to size */
+ masklen = pg_leftmost_one_pos64(size);
+ mask = (((uint64) 1) << masklen) - 1;
+
+ /*
+ * Permute the input value by applying 4 rounds of pseudorandom bijective
+ * transformations. The intention here is to distribute each input
+ * uniformly randomly across the range, and separate adjacent inputs
+ * approximately uniformly randomly from each other, leading to a fairly
+ * random overall choice of permutation.
+ *
+ * To separate adjacent inputs, we multiply by a random number modulo
+ * (mask + 1), which is a power of 2. For this to be a bijection, the
+ * multiplier must be odd. Since this is known to lead to less randomness
+ * in the lower bits, we also apply a rotation that shifts the topmost bit
+ * into the least significant bit. In the special cases where size <= 3,
+ * mask = 1 and each of these operations is actually a no-op, so we also
+ * XOR with a different random number to inject additional randomness.
+ * Since the size is generally not a power of 2, we apply this bijection
+ * on overlapping upper and lower halves of the input.
+ *
+ * To distribute the inputs uniformly across the range, we then also apply
+ * a random offset modulo the full range.
+ *
+ * Taken together, these operations resemble a modified linear
+ * congruential generator, as is commonly used in pseudorandom number
+ * generators. Empirically, it is found that for small sizes it selects
+ * each of the (size!) possible permutations with roughly equal
+ * probability. For larger sizes, not all permutations can be generated,
+ * but the intended random spread is still produced.
+ */
+ for (i = 0; i < 4; i++)
+ {
+ uint64 m,
+ r,
+ t;
+
+ /* Random multiply (by an odd number), XOR and rotate of lower half */
+ m = randu64(&random_state1) | 1;
+ r = randu64(&random_state2);
+ if (v <= mask)
+ {
+ v = ((v * m) ^ r) & mask;
+ v = ((v << 1) & mask) | (v >> (masklen - 1));
+ }
+
+ /* Random multiply (by an odd number), XOR and rotate of upper half */
+ m = randu64(&random_state1) | 1;
+ r = randu64(&random_state2);
+ t = size - 1 - v;
+ if (t <= mask)
+ {
+ t = ((t * m) ^ r) & mask;
+ t = ((t << 1) & mask) | (t >> (masklen - 1));
+ v = size - 1 - t;
+ }
+
+ /* Random offset */
+ r = randu64(&random_state2);
+ v = (v + r) % size;
+ }
+
+ return (int64) v;
+}
+
/*
* Initialize the given SimpleStats struct to all zeroes
*/
@@ -2475,6 +2590,29 @@ evalStandardFunc(CState *st,
return true;
}
+ case PGBENCH_PERMUTE:
+ {
+ int64 val,
+ size,
+ seed;
+
+ Assert(nargs == 3);
+
+ if (!coerceToInt(&vargs[0], &val) ||
+ !coerceToInt(&vargs[1], &size) ||
+ !coerceToInt(&vargs[2], &seed))
+ return false;
+
+ if (size <= 0)
+ {
+ pg_log_error("permute size parameter must be greater than zero");
+ return false;
+ }
+
+ setIntValue(retval, permute(val, size, seed));
+ return true;
+ }
+
default:
/* cannot get here */
Assert(0);
diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h
index 3a9d89e6f1..6ce1c98649 100644
--- a/src/bin/pgbench/pgbench.h
+++ b/src/bin/pgbench/pgbench.h
@@ -99,7 +99,8 @@ typedef enum PgBenchFunction
PGBENCH_IS,
PGBENCH_CASE,
PGBENCH_HASH_FNV1A,
- PGBENCH_HASH_MURMUR2
+ PGBENCH_HASH_MURMUR2,
+ PGBENCH_PERMUTE
} PgBenchFunction;
typedef struct PgBenchExpr PgBenchExpr;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 82a46c72b6..d2b7f2efcb 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -4,6 +4,7 @@ use warnings;
use PostgresNode;
use TestLib;
use Test::More;
+use Config;
# start a pgbench specific server
my $node = get_new_node('main');
@@ -483,6 +484,17 @@ pgbench(
qr{command=98.: int 5432\b}, # :random_seed
qr{command=99.: int -9223372036854775808\b}, # min int
qr{command=100.: int 9223372036854775807\b}, # max int
+ # pseudorandom permutation tests
+ qr{command=101.: boolean true\b},
+ qr{command=102.: boolean true\b},
+ qr{command=103.: boolean true\b},
+ qr{command=104.: boolean true\b},
+ qr{command=105.: boolean true\b},
+ qr{command=109.: boolean true\b},
+ qr{command=110.: boolean true\b},
+ qr{command=111.: boolean true\b},
+ qr{command=112.: int 9223372036854775797\b},
+ qr{command=113.: boolean true\b},
],
'pgbench expressions',
{
@@ -610,6 +622,33 @@ SELECT :v0, :v1, :v2, :v3;
-- minint constant parsing
\set min debug(-9223372036854775808)
\set max debug(-(:min + 1))
+-- parametric pseudorandom permutation function
+\set t debug(permute(0, 2) + permute(1, 2) = 1)
+\set t debug(permute(0, 3) + permute(1, 3) + permute(2, 3) = 3)
+\set t debug(permute(0, 4) + permute(1, 4) + permute(2, 4) + permute(3, 4) = 6)
+\set t debug(permute(0, 5) + permute(1, 5) + permute(2, 5) + permute(3, 5) + permute(4, 5) = 10)
+\set t debug(permute(0, 16) + permute(1, 16) + permute(2, 16) + permute(3, 16) + \
+ permute(4, 16) + permute(5, 16) + permute(6, 16) + permute(7, 16) + \
+ permute(8, 16) + permute(9, 16) + permute(10, 16) + permute(11, 16) + \
+ permute(12, 16) + permute(13, 16) + permute(14, 16) + permute(15, 16) = 120)
+-- random sanity checks
+\set size random(2, 1000)
+\set v random(0, :size - 1)
+\set p permute(:v, :size)
+\set t debug(0 <= :p and :p < :size and :p = permute(:v + :size, :size) and :p <> permute(:v + 1, :size))
+-- actual values
+\set t debug(permute(:v, 1) = 0)
+\set t debug(permute(0, 2, 5432) = 0 and permute(1, 2, 5432) = 1 and \
+ permute(0, 2, 5435) = 1 and permute(1, 2, 5435) = 0)
+-- 63 bits tests
+\set size debug(:max - 10)
+\set t debug(permute(:size-1, :size, 5432) = 6027738441054679063 and \
+ permute(:size-2, :size, 5432) = 3883756639104291588 and \
+ permute(:size-3, :size, 5432) = 3407620656580533607 and \
+ permute(:size-4, :size, 5432) = 4439146987379709618 and \
+ permute(:size-5, :size, 5432) = 2736158554213998904 and \
+ permute(:size-6, :size, 5432) = 1448350938097189462 and \
+ permute(:size-7, :size, 5432) = 3955131470409280934)
}
});
@@ -1048,6 +1087,10 @@ SELECT LEAST(} . join(', ', (':i') x 256) . q{)}
'bad boolean', 2,
[qr{malformed variable.*trueXXX}], q{\set b :badtrue or true}
],
+ [
+ 'invalid permute size', 2,
+ [qr{permute size parameter must be greater than zero}], q{\set i permute(0, 0)}
+ ],
# GSET
[
diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl
index e38c7d77d1..4027e68dfa 100644
--- a/src/bin/pgbench/t/002_pgbench_no_server.pl
+++ b/src/bin/pgbench/t/002_pgbench_no_server.pl
@@ -341,6 +341,16 @@ my @script_tests = (
'set i',
[ qr{set i 1 }, qr{\^ error found here} ],
{ 'set_i_op' => "\\set i 1 +\n" }
+ ],
+ [
+ 'not enough arguments to permute',
+ [qr{unexpected number of arguments \(permute\)}],
+ { 'bad-permute-1.sql' => "\\set i permute(1)\n" }
+ ],
+ [
+ 'too many arguments to permute',
+ [qr{unexpected number of arguments \(permute\)}],
+ { 'bad-permute-2.sql' => "\\set i permute(1, 2, 3, 4)\n" }
],);
for my $t (@script_tests)