Please do this work upstream in glibc rather than in the corner case of DPDK.
On Fri, Mar 27, 2015 at 9:38 AM, Sanford, Robert <rsanford at akamai.com> wrote: > Never mind ... I cancel the previous suggestion. After further reading on > RNGs, I believe we should abandon the use of lrand48() and implement our > own RNG based on the so-called KISS family of RNGs originally proposed by > the late George Marsaglia. In his excellent paper, "Good Practice in > (Pseudo) Random Number Generation for Bioinformatics Applications", David > Jones (UCL Bioinformatics Group) describes a few variants of KISS > generators. This paper, and Robert G. Brown's (Duke Univ.) comprehensive > "Dieharder" random number test suite work show that KISS RNGs are simple > and fast, yet high quality. > > Something like JLKISS64(), with state kept in TLS, would be ideal for DPDK > use. In limited experiments, I found JLKISS64() (not inlined, compiles to > ~40 instructions) to be ~4 times faster than rte_rand(). This is probably > because JLKISS64() achieves integer-instruction parallelism, while > rte_rand(), with its two calls to lrand48(), nrand48_r(), and > __drand48_iterate(), does not (and all those calls!). > > Here is the JLKISS64() function, as it appears in Jones's GoodPracticeRNG > paper: > > --- > > > > > > > > > > /* Public domain code for JLKISS64 > RNG - long period KISS RNG > producing 64-bit results */ > > unsigned long long x = > 123456789123ULL,y = 987654321987ULL; /* Seed > variables */ > unsigned int z1 = 43219876, c1 = 6543217, z2 = 21987643, c2 = 1732654; /* > Seed variables */ > > unsigned long long JLKISS64() > { > > unsigned long long t; > > x = 1490024343005336237ULL * x + > 123456789; > y ^= y << 21; y ^= y >> 17; y ^= y << 30; /* Do not set y=0! */ > t = 4294584393ULL * z1 + c1; c1 = t >> 32; z1 = t; > t = 4246477509ULL * z2 + c2; c2 = t >> 32; z2 = t; > > return x + y + z1 + ((unsigned > long long)z2 << 32); /* Return 64-bit > result */ > } > > > > > > > > > -- > Regards, > Robert > > > > >The implementation of rte_rand() returns only 62 bits of > >pseudo-randomness, because the underlying calls to lrand48() > >"return non-negative long integers uniformly distributed > >between 0 and 2^31." > > > >We have written a potential fix, but before we spend more > >time testing and refining it, I wanted to check with you > >guys. > > > >We switched to using the reentrant versions of [ls]rand48, > >and maintain per-lcore state. We need ~2.06 calls to > >lrand48_r(), per call to rte_rand(). > > > >Do you agree with the approach we've taken in this patch? > > > > > >Thanks, > >Robert > > > >--- > > lib/librte_eal/common/include/rte_random.h | 33 > >+++++++++++++++++++++++++-- > > lib/librte_eal/linuxapp/eal/eal_thread.c | 7 ++++++ > > 2 files changed, 37 insertions(+), 3 deletions(-) > > > >diff --git a/lib/librte_eal/common/include/rte_random.h > >b/lib/librte_eal/common/include/rte_random.h > >index 24ae836..b9248cd 100644 > >--- a/lib/librte_eal/common/include/rte_random.h > >+++ b/lib/librte_eal/common/include/rte_random.h > >@@ -46,6 +46,17 @@ extern "C" { > > > > #include <stdint.h> > > #include <stdlib.h> > >+#include <rte_per_lcore.h> > >+#include <rte_branch_prediction.h> > >+ > >+struct rte_rand_data { > >+ struct drand48_data _dr48; > >+ uint32_t _hi_bits; > >+ uint8_t _bits_left; > >+}; > >+ > >+RTE_DECLARE_PER_LCORE(struct rte_rand_data, _rand_data); > >+ > > > > /** > > * Seed the pseudo-random generator. > >@@ -60,7 +71,7 @@ extern "C" { > > static inline void > > rte_srand(uint64_t seedval) > > { > >- srand48((long unsigned int)seedval); > >+ srand48_r((long unsigned int)seedval, > &RTE_PER_LCORE(_rand_data)._dr48); > > } > > > > /** > >@@ -76,10 +87,26 @@ rte_srand(uint64_t seedval) > > static inline uint64_t > > rte_rand(void) > > { > >+ struct rte_rand_data *rd = &RTE_PER_LCORE(_rand_data); > > uint64_t val; > >- val = lrand48(); > >+ uint32_t hi_bits; > >+ long int result; > >+ > >+ if (unlikely(rd->_bits_left < 2)) { > >+ lrand48_r(&rd->_dr48, &result); > >+ rd->_hi_bits |= (uint32_t)result << (1 - rd->_bits_left); > >+ rd->_bits_left += 31; > >+ } > >+ > >+ hi_bits = rd->_hi_bits; > >+ lrand48_r(&rd->_dr48, &result); > >+ val = (uint32_t)result | (hi_bits & 0x8000000); > > val <<= 32; > >- val += lrand48(); > >+ hi_bits <<= 1; > >+ lrand48_r(&rd->_dr48, &result); > >+ val |= (uint32_t)result | (hi_bits & 0x8000000); > >+ rd->_hi_bits = hi_bits << 1; > >+ rd->_bits_left -= 2; > > return val; > > } > > > >diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c > >b/lib/librte_eal/linuxapp/eal/eal_thread.c > >index 5635c7d..08e7f72 100644 > >--- a/lib/librte_eal/linuxapp/eal/eal_thread.c > >+++ b/lib/librte_eal/linuxapp/eal/eal_thread.c > >@@ -52,6 +52,8 @@ > > #include <rte_eal.h> > > #include <rte_per_lcore.h> > > #include <rte_lcore.h> > >+#include <rte_cycles.h> > >+#include <rte_random.h> > > > > #include "eal_private.h" > > #include "eal_thread.h" > >@@ -59,6 +61,8 @@ > > RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; > > RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; > > RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); > >+RTE_DEFINE_PER_LCORE(struct rte_rand_data, _rand_data); > >+ > > > > /* > > * Send a message to a slave lcore identified by slave_id to call a > >@@ -147,6 +151,9 @@ eal_thread_loop(__attribute__((unused)) void *arg) > > /* set the lcore ID in per-lcore memory area */ > > RTE_PER_LCORE(_lcore_id) = lcore_id; > > > >+ /* seed per-lcore PRNG */ > >+ rte_srand(rte_rdtsc()); > >+ > > /* set CPU affinity */ > > if (eal_thread_set_affinity() < 0) > > rte_panic("cannot set affinity\n"); > >-- > >1.7.1 > > > >