- lrand48 (48 bits state as 3 uint16)        is 29 ops
  (10 =, 8 *, 7 +, 4 >>)

- xoshiro256** (256 bits states as 4 uint64) is 24 ops (18 if rot in hw)
  8 =, 2 *, 2 +, 5 <<, 5 ^, 2 |

See http://vigna.di.unimi.it/xorshift/

Small benchmark on my laptop with gcc-7.3 -O3:

 - pg_lrand48 takes 4.0 seconds to generate 1 billion 32-bit ints

 - xoshiro256** takes 1.6 seconds to generate 1 billion 64-bit ints

With -O2 it is 4.8 and 3.4 seconds, respectively. So significantly better speed _and_ quality are quite achievable.

Note that small attempt at optimizing these functions (inline constants, array replaced with scalars) did not yield significant improvements.

--
Fabien.
// pg erand48 implementation
#include <stdio.h>

#define RAND48_SEED_0   (0x330e)
#define RAND48_SEED_1   (0xabcd)
#define RAND48_SEED_2   (0x1234)
#define RAND48_MULT_0   (0xe66d)
#define RAND48_MULT_1   (0xdeec)
#define RAND48_MULT_2   (0x0005)
#define RAND48_ADD              (0x000b)

static unsigned short _rand48_seed[3] = {
        RAND48_SEED_0,
        RAND48_SEED_1,
        RAND48_SEED_2
};
static unsigned short _rand48_mult[3] = {
        RAND48_MULT_0,
        RAND48_MULT_1,
        RAND48_MULT_2
};
static unsigned short _rand48_add = RAND48_ADD;

static void
_dorand48(unsigned short xseed[3])
{
  unsigned long accu;
  unsigned short temp[2];

  accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0] +
    (unsigned long) _rand48_add;
  temp[0] = (unsigned short) accu;        /* lower 16 bits */
  accu >>= sizeof(unsigned short) * 8;
  accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1] +
    (unsigned long) _rand48_mult[1] * (unsigned long) xseed[0];
  temp[1] = (unsigned short) accu;        /* middle 16 bits */
  accu >>= sizeof(unsigned short) * 8;
  accu += _rand48_mult[0] * xseed[2] + _rand48_mult[1] * xseed[1] + _rand48_mult[2] * xseed[0];
  xseed[0] = temp[0];
  xseed[1] = temp[1];
  xseed[2] = (unsigned short) accu;
}

long
pg_lrand48(void)
{
  _dorand48(_rand48_seed);
  return ((long) _rand48_seed[2] << 15) + ((long) _rand48_seed[1] >> 1);
}

int main(void)
{
  long l;
  for (int i = 0; i < 1000000000; i++)
    l = pg_lrand48();

  fprintf(stdout, "%ld\n", l);
  return 0;
}
// XOROSHI256**

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}

static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result_starstar = rotl(s[1] * 5, 7) * 9;
	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result_starstar;
}

int main(void)
{
  uint64_t l;

  s[0] = 0xdeadbeefdeadbeefLL;
  s[1] = 0xdeadbeefdeadbeefLL;
  s[2] = 0xdeadbeefdeadbeefLL;
  s[3] = 0xdeadbeefdeadbeefLL;

  // loop 1 billion times
  for (int i = 0; i < 1000000000; i++)
    l = next();

  fprintf(stdout, "%lu\n", l);
  return 0;
}

Reply via email to