On Thu, 26 May 2022 15:20:29 +0200 Mattias Rönnblom <hof...@lysator.liu.se> wrote:
> On 2022-05-25 22:31, Stephen Hemminger wrote: > > The PIE code and other applications can benefit from having a > > fast way to get a random floating point value. This new function > > is equivalent to drand() in the standard library. > > > > Signed-off-by: Stephen Hemminger <step...@networkplumber.org> > > --- > > app/test/test_rand_perf.c | 7 +++++ > > doc/guides/rel_notes/release_22_07.rst | 5 ++++ > > lib/eal/common/rte_random.c | 41 ++++++++++++++++++++++++++ > > lib/eal/include/rte_random.h | 18 +++++++++++ > > lib/eal/meson.build | 3 ++ > > lib/eal/version.map | 1 + > > 6 files changed, 75 insertions(+) > > > > diff --git a/app/test/test_rand_perf.c b/app/test/test_rand_perf.c > > index fe797ebfa1ca..26fb1d9a586e 100644 > > --- a/app/test/test_rand_perf.c > > +++ b/app/test/test_rand_perf.c > > @@ -20,6 +20,7 @@ static volatile uint64_t vsum; > > > > enum rand_type { > > rand_type_64, > > + rand_type_float, > > rand_type_bounded_best_case, > > rand_type_bounded_worst_case > > }; > > @@ -30,6 +31,8 @@ rand_type_desc(enum rand_type rand_type) > > switch (rand_type) { > > case rand_type_64: > > return "Full 64-bit [rte_rand()]"; > > + case rand_type_float: > > + return "Floating point [rte_drand()]"; > > case rand_type_bounded_best_case: > > return "Bounded average best-case [rte_rand_max()]"; > > case rand_type_bounded_worst_case: > > @@ -55,6 +58,9 @@ test_rand_perf_type(enum rand_type rand_type) > > case rand_type_64: > > sum += rte_rand(); > > break; > > + case rand_type_float: > > + sum += 1000. * rte_drand(); > > Including this floating point multiplication will lead to an > overestimation of rte_drand() latency. > > You could refactor this function to be a macro, and pass the return type > as a parameter to this macro. I did just that, and on both an AMD > 5900X and a Cortex-A72 it didn't add more than ~5%, so I don't think > it's necessary. 
> > > + break; > > case rand_type_bounded_best_case: > > sum += rte_rand_max(BEST_CASE_BOUND); > > break; > > @@ -83,6 +89,7 @@ test_rand_perf(void) > > printf("Pseudo-random number generation latencies:\n"); > > > > test_rand_perf_type(rand_type_64); > > + test_rand_perf_type(rand_type_float); > > test_rand_perf_type(rand_type_bounded_best_case); > > test_rand_perf_type(rand_type_bounded_worst_case); > > > > diff --git a/doc/guides/rel_notes/release_22_07.rst > > b/doc/guides/rel_notes/release_22_07.rst > > index e49cacecefd4..b131ea577226 100644 > > --- a/doc/guides/rel_notes/release_22_07.rst > > +++ b/doc/guides/rel_notes/release_22_07.rst > > @@ -104,6 +104,11 @@ New Features > > * ``RTE_EVENT_QUEUE_ATTR_WEIGHT`` > > * ``RTE_EVENT_QUEUE_ATTR_AFFINITY`` > > > > +* ** Added function get random floating point number.** > > + > > + Added the function ``rte_drand()`` to provide a pseudo-random > > + floating point number. > > + > > > > Removed Items > > ------------- > > diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c > > index 4535cc980cec..3dc3484ee655 100644 > > --- a/lib/eal/common/rte_random.c > > +++ b/lib/eal/common/rte_random.c > > @@ -6,6 +6,9 @@ > > #include <x86intrin.h> > > #endif > > #include <unistd.h> > > +#ifdef RTE_LIBEAL_USE_IEEE754 > > +#include <ieee754.h> > > +#endif > > > > #include <rte_branch_prediction.h> > > #include <rte_cycles.h> > > @@ -173,6 +176,44 @@ rte_rand_max(uint64_t upper_bound) > > return res; > > } > > > > +double > > +rte_drand(void) > > +{ > > + struct rte_rand_state *state = __rte_rand_get_state(); > > + uint64_t rand64 = __rte_rand_lfsr258(state); > > +#ifdef RTE_LIBEAL_USE_IEEE754 > > + union ieee754_double u = { > > + .ieee = { > > + .negative = 0, > > + .exponent = IEEE754_DOUBLE_BIAS, > > + }, > > + }; > > + > > + /* Take 64 bit random value and put it into the mantissa > > + * This uses direct access to IEEE format to avoid doing > > + * any direct floating point math here. 
> > + */ > > + u.ieee.mantissa0 = rand64 >> 32; > > + u.ieee.mantissa1 = rand64; > > + > > + return u.d - 1.0; > > +#else > > + /* Slower method requiring floating point divide > > + * > > Do you know how much slower? I ran rand_perf_test on two of my systems. > > AMD 5900X Pi4 (ARM Cortex-A72) > IEEE754 version 12 1.19 > Non-IEEE754 version 11 1.16 > Naive version* 24 1.16 > > * (double)rte_rand() / (double)UINT64_MAX > > Numbers are TSC cycles/op. On AMD Ryzen 7 both versions take 9 cycles/op with the rand_perf_autotest, so it is a toss-up. The 754 version is: ubfx r1, r1, #0, #20 orr r3, r1, #1069547520 << mantissa0 mov r2, r0 orr r3, r3, #3145728 vmov.f64 d0, #1.0e+0 vmov d16, r2, r3 vsub.f64 d0, d16, d0 << return u.d - 1.0 Note: the compiler is doing a smart optimization on the divide version. It knows that, since the denominator is a fixed value, it can use a multiply instead. vmov d16, r0, r1 vmul.f64 d0, d16, d0