Never mind ... I cancel the previous suggestion. After further reading on RNGs, I believe we should abandon the use of lrand48() and implement our own RNG based on the so-called KISS family of RNGs originally proposed by the late George Marsaglia. In his excellent paper, "Good Practice in (Pseudo) Random Number Generation for Bioinformatics Applications", David Jones (UCL Bioinformatics Group) describes a few variants of KISS generators. This paper, and Robert G. Brown's (Duke Univ.) comprehensive "Dieharder" random number test suite work show that KISS RNGs are simple and fast, yet high quality.
Something like JLKISS64(), with state kept in TLS, would be ideal for DPDK use. In limited experiments, I found JLKISS64() (not inlined, compiles to ~40 instructions) to be ~4 times faster than rte_rand(). This is probably because JLKISS64() achieves integer-instruction parallelism, while rte_rand(), with its two calls to lrand48(), nrand48_r(), and __drand48_iterate(), does not (and all those calls!). Here is the JLKISS64() function, as it appears in Jones's GoodPracticeRNG paper: --- /* Public domain code for JLKISS64 RNG - long period KISS RNG producing 64-bit results */ unsigned long long x = 123456789123ULL,y = 987654321987ULL; /* Seed variables */ unsigned int z1 = 43219876, c1 = 6543217, z2 = 21987643, c2 = 1732654; /* Seed variables */ unsigned long long JLKISS64() { unsigned long long t; x = 1490024343005336237ULL * x + 123456789; y ^= y << 21; y ^= y >> 17; y ^= y << 30; /* Do not set y=0! */ t = 4294584393ULL * z1 + c1; c1 = t >> 32; z1 = t; t = 4246477509ULL * z2 + c2; c2 = t >> 32; z2 = t; return x + y + z1 + ((unsigned long long)z2 << 32); /* Return 64-bit result */ } -- Regards, Robert >The implementation of rte_rand() returns only 62 bits of >pseudo-randomness, because the underlying calls to lrand48() >"return non-negative long integers uniformly distributed >between 0 and 2^31." > >We have written a potential fix, but before we spend more >time testing and refining it, I wanted to check with you >guys. > >We switched to using the reentrant versions of [ls]rand48, >and maintain per-lcore state. We need ~2.06 calls to >lrand48_r(), per call to rte_rand(). > >Do you agree with the approach we've taken in this patch? > > >Thanks, >Robert > >--- > lib/librte_eal/common/include/rte_random.h | 33 >+++++++++++++++++++++++++-- > lib/librte_eal/linuxapp/eal/eal_thread.c | 7 ++++++ > 2 files changed, 37 insertions(+), 3 deletions(-) > >diff --git a/lib/librte_eal/common/include/rte_random.h >b/lib/librte_eal/common/include/rte_random.h >index 24ae836..b9248cd 100644 >--- a/lib/librte_eal/common/include/rte_random.h >+++ b/lib/librte_eal/common/include/rte_random.h >@@ -46,6 +46,17 @@ extern "C" { > > #include <stdint.h> > #include <stdlib.h> >+#include <rte_per_lcore.h> >+#include <rte_branch_prediction.h> >+ >+struct rte_rand_data { >+ struct drand48_data _dr48; >+ uint32_t _hi_bits; >+ uint8_t _bits_left; >+}; >+ >+RTE_DECLARE_PER_LCORE(struct rte_rand_data, _rand_data); >+ > > /** > * Seed the pseudo-random generator. >@@ -60,7 +71,7 @@ extern "C" { > static inline void > rte_srand(uint64_t seedval) > { >- srand48((long unsigned int)seedval); >+ srand48_r((long unsigned int)seedval, &RTE_PER_LCORE(_rand_data)._dr48); > } > > /** >@@ -76,10 +87,26 @@ rte_srand(uint64_t seedval) > static inline uint64_t > rte_rand(void) > { >+ struct rte_rand_data *rd = &RTE_PER_LCORE(_rand_data); > uint64_t val; >- val = lrand48(); >+ uint32_t hi_bits; >+ long int result; >+ >+ if (unlikely(rd->_bits_left < 2)) { >+ lrand48_r(&rd->_dr48, &result); >+ rd->_hi_bits |= (uint32_t)result << (1 - rd->_bits_left); >+ rd->_bits_left += 31; >+ } >+ >+ hi_bits = rd->_hi_bits; >+ lrand48_r(&rd->_dr48, &result); >+ val = (uint32_t)result | (hi_bits & 0x8000000); > val <<= 32; >- val += lrand48(); >+ hi_bits <<= 1; >+ lrand48_r(&rd->_dr48, &result); >+ val |= (uint32_t)result | (hi_bits & 0x8000000); >+ rd->_hi_bits = hi_bits << 1; >+ rd->_bits_left -= 2; > return val; > } > >diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c >b/lib/librte_eal/linuxapp/eal/eal_thread.c >index 5635c7d..08e7f72 100644 >--- a/lib/librte_eal/linuxapp/eal/eal_thread.c >+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c >@@ -52,6 +52,8 @@ > #include <rte_eal.h> > #include <rte_per_lcore.h> > #include <rte_lcore.h> >+#include <rte_cycles.h> >+#include <rte_random.h> > > #include "eal_private.h" > #include "eal_thread.h" >@@ -59,6 +61,8 @@ > RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; > RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; > RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); >+RTE_DEFINE_PER_LCORE(struct rte_rand_data, _rand_data); >+ > > /* > * Send a message to a slave lcore identified by slave_id to call a >@@ -147,6 +151,9 @@ eal_thread_loop(__attribute__((unused)) void *arg) > /* set the lcore ID in per-lcore memory area */ > RTE_PER_LCORE(_lcore_id) = lcore_id; > >+ /* seed per-lcore PRNG */ >+ rte_srand(rte_rdtsc()); >+ > /* set CPU affinity */ > if (eal_thread_set_affinity() < 0) > rte_panic("cannot set affinity\n"); >-- >1.7.1 >