- lrand48 (48 bits state as 3 uint16) is 29 ops
(10 =, 8 *, 7 +, 4 >>)
- xoshiro256** (256 bits states as 4 uint64) is 24 ops (18 if rot in hw)
8 =, 2 *, 2 +, 5 <<, 5 ^, 2 |
See http://vigna.di.unimi.it/xorshift/
Small benchmark on my laptop with gcc-7.3 -O3:
- pg_lrand48 takes 4.0 seconds to generate 1 billion 32-bit ints
- xoshiro256** takes 1.6 seconds to generate 1 billion 64-bit ints
With -O2 it is 4.8 and 3.4 seconds, respectively. So significantly better
speed _and_ quality are quite achievable.
Note that small attempt at optimizing these functions (inline constants,
array replaced with scalars) did not yield significant improvements.
--
Fabien.
// pg erand48 implementation
#include <stdio.h>
#define RAND48_SEED_0 (0x330e)
#define RAND48_SEED_1 (0xabcd)
#define RAND48_SEED_2 (0x1234)
#define RAND48_MULT_0 (0xe66d)
#define RAND48_MULT_1 (0xdeec)
#define RAND48_MULT_2 (0x0005)
#define RAND48_ADD (0x000b)
static unsigned short _rand48_seed[3] = {
RAND48_SEED_0,
RAND48_SEED_1,
RAND48_SEED_2
};
static unsigned short _rand48_mult[3] = {
RAND48_MULT_0,
RAND48_MULT_1,
RAND48_MULT_2
};
static unsigned short _rand48_add = RAND48_ADD;
static void
_dorand48(unsigned short xseed[3])
{
unsigned long accu;
unsigned short temp[2];
accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0] +
(unsigned long) _rand48_add;
temp[0] = (unsigned short) accu; /* lower 16 bits */
accu >>= sizeof(unsigned short) * 8;
accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1] +
(unsigned long) _rand48_mult[1] * (unsigned long) xseed[0];
temp[1] = (unsigned short) accu; /* middle 16 bits */
accu >>= sizeof(unsigned short) * 8;
accu += _rand48_mult[0] * xseed[2] + _rand48_mult[1] * xseed[1] + _rand48_mult[2] * xseed[0];
xseed[0] = temp[0];
xseed[1] = temp[1];
xseed[2] = (unsigned short) accu;
}
long
pg_lrand48(void)
{
_dorand48(_rand48_seed);
return ((long) _rand48_seed[2] << 15) + ((long) _rand48_seed[1] >> 1);
}
int main(void)
{
long l;
for (int i = 0; i < 1000000000; i++)
l = pg_lrand48();
fprintf(stdout, "%ld\n", l);
return 0;
}
// XOROSHI256**
#include <stdio.h>
#include <stdint.h>
static inline uint64_t rotl(const uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
}
static uint64_t s[4];
uint64_t next(void) {
const uint64_t result_starstar = rotl(s[1] * 5, 7) * 9;
const uint64_t t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rotl(s[3], 45);
return result_starstar;
}
int main(void)
{
uint64_t l;
s[0] = 0xdeadbeefdeadbeefLL;
s[1] = 0xdeadbeefdeadbeefLL;
s[2] = 0xdeadbeefdeadbeefLL;
s[3] = 0xdeadbeefdeadbeefLL;
// loop 1 billion times
for (int i = 0; i < 1000000000; i++)
l = next();
fprintf(stdout, "%lu\n", l);
return 0;
}