Emilio G. Cota <c...@braap.org> writes:

> This will allow us to measure the performance impact of FP
> emulation optimizations.
>
> Signed-off-by: Emilio G. Cota <c...@braap.org>
> ---
>  tests/fp-bench.c       | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
>  tests/.gitignore       |   1 +
>  tests/Makefile.include |   3 +-
>  3 files changed, 293 insertions(+), 1 deletion(-)
>  create mode 100644 tests/fp-bench.c
>
> diff --git a/tests/fp-bench.c b/tests/fp-bench.c
> new file mode 100644
> index 0000000..a782093
> --- /dev/null
> +++ b/tests/fp-bench.c
> @@ -0,0 +1,290 @@
> +/*
> + * fp-bench.c - A collection of simple floating point microbenchmarks.
> + *
> + * Copyright (C) 2018, Emilio G. Cota <c...@braap.org>
> + *
> + * License: GNU GPL, version 2 or later.
> + *   See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "qemu/atomic.h"
> +
> +#include <math.h>
> +
> +#include <sys/time.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <time.h>
> +
> +/* amortize the computation of random inputs */
> +#define OPS_PER_ITER (1000ULL)
> +
> +#define SEED_A 0xdeadfacedeadface
> +#define SEED_B 0xbadc0feebadc0fee
> +#define SEED_C 0xbeefdeadbeefdead
> +
> +enum op {
> +    OP_ADD,
> +    OP_SUB,
> +    OP_MUL,
> +    OP_DIV,
> +    OP_FMA,
> +    OP_SQRT,
> +};
> +
> +static const char * const op_names[] = {
> +    [OP_ADD] = "add",
> +    [OP_SUB] = "sub",
> +    [OP_MUL] = "mul",
> +    [OP_DIV] = "div",
> +    [OP_FMA] = "fma",
> +    [OP_SQRT] = "sqrt",
> +};
> +
> +static uint64_t n_ops = 10000000;
> +static enum op op;
> +static const char *precision = "float";
> +
> +static const char commands_string[] =
> +    " -n = number of floating point operations\n"
> +    " -o = floating point operation (add, sub, mul, div, fma, sqrt). Default: add\n"
> +    " -p = precision (float|single, double). Default: float";
> +
> +static void usage_complete(int argc, char *argv[])
> +{
> +    fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> +    fprintf(stderr, "options:\n%s\n", commands_string);
> +    exit(-1);
> +}
> +
> +static void set_op(const char *name)
> +{
> +    int i;
> +
> +    for (i = 0; i < ARRAY_SIZE(op_names); i++) {
> +        if (strcmp(name, op_names[i]) == 0) {
> +            op = i;
> +            return;
> +        }
> +    }
> +    fprintf(stderr, "Unsupported op '%s'\n", name);
> +    exit(EXIT_FAILURE);
> +}
> +
> +static inline int64_t get_clock_realtime(void)
> +{
> +    struct timeval tv;
> +
> +    gettimeofday(&tv, NULL);
> +    return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> +    x ^= x >> 12; /* a */
> +    x ^= x << 25; /* b */
> +    x ^= x >> 27; /* c */
> +    return x * UINT64_C(2685821657736338717);
> +}
> +
> +static inline bool u32_is_normal(uint32_t x)
> +{
> +    return ((x + 0x00800000) & 0x7fffffff) >= 0x01000000;
> +}
> +
> +static inline bool u64_is_normal(uint64_t x)
> +{
> +    return ((x + (1ULL << 52)) & -1ULL >> 1) >= 1ULL << 53;
> +}
> +
> +static inline float get_random_float(uint64_t *x)
> +{
> +    uint64_t r = *x;
> +    uint32_t r32;
> +
> +    do {
> +        r = xorshift64star(r);
> +    } while (!u32_is_normal(r));
> +    *x = r;
> +    r32 = r;
> +    return *(float *)&r32;
> +}
> +
> +static inline double get_random_double(uint64_t *x)
> +{
> +    uint64_t r = *x;
> +
> +    do {
> +        r = xorshift64star(r);
> +    } while (!u64_is_normal(r));
> +    *x = r;
> +    return *(double *)&r;
> +}
> +
> +/*
> + * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
> + * volatile.
> + */
> +#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION)                          \
> +    static void NAME(volatile PRECISION *res)                          \
> +    {                                                                  \
> +        uint64_t ra = SEED_A;                                          \
> +        uint64_t i, j;                                                 \
> +                                                                       \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                    \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);  \
> +                                                                       \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                       \
> +                *res = FUNC(a);                                        \
> +            }                                                          \
> +        }                                                              \
> +    }
> +

Have you had a chance to look at whether this will vectorise? I have a
similar benchmark which I compile with multiple options to test normal,
NEON/AdvSIMD and SVE enabled loops.
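For illustration, this is roughly the loop shape I have in mind; a
minimal, untested sketch (none of these names come from the patch) with
the volatile qualifiers dropped and the results written to an array, so
that with -O3 and the relevant -march/-mcpu flags the compiler is at
least allowed to auto-vectorise the inner loop:

  #include <stdint.h>

  #define VEC_LEN 1024

  static float vec_a[VEC_LEN];
  static float vec_b[VEC_LEN];
  static float vec_res[VEC_LEN];

  /*
   * Element-wise adds with no volatile accesses: each iteration of the
   * inner loop is independent, so this is a straightforward candidate
   * for the auto-vectoriser.
   */
  static void bench_float_add_vec(uint64_t iters)
  {
      uint64_t i;
      int j;

      for (i = 0; i < iters; i++) {
          for (j = 0; j < VEC_LEN; j++) {
              vec_res[j] = vec_a[j] + vec_b[j];
          }
      }
  }

That measures something rather different from the scalar path the
softfloat code sees, so it would probably want to be a separate mode
rather than a replacement for the volatile loops.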
> +GEN_BENCH_1OPF(bench_float_sqrt, sqrtf, float)
> +GEN_BENCH_1OPF(bench_double_sqrt, sqrt, double)
> +#undef GEN_BENCH_1OPF
> +
> +#define GEN_BENCH_2OP(NAME, OP, PRECISION)                             \
> +    static void NAME(volatile PRECISION *res)                          \
> +    {                                                                  \
> +        uint64_t ra = SEED_A;                                          \
> +        uint64_t rb = SEED_B;                                          \
> +        uint64_t i, j;                                                 \
> +                                                                       \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                    \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);  \
> +            volatile PRECISION b = glue(get_random_, PRECISION)(&rb);  \
> +                                                                       \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                       \
> +                *res = a OP b;                                         \
> +            }                                                          \
> +        }                                                              \
> +    }
> +
> +GEN_BENCH_2OP(bench_float_add, +, float)
> +GEN_BENCH_2OP(bench_float_sub, -, float)
> +GEN_BENCH_2OP(bench_float_mul, *, float)
> +GEN_BENCH_2OP(bench_float_div, /, float)
> +
> +GEN_BENCH_2OP(bench_double_add, +, double)
> +GEN_BENCH_2OP(bench_double_sub, -, double)
> +GEN_BENCH_2OP(bench_double_mul, *, double)
> +GEN_BENCH_2OP(bench_double_div, /, double)
> +
> +#define GEN_BENCH_3OPF(NAME, FUNC, PRECISION)                          \
> +    static void NAME(volatile PRECISION *res)                          \
> +    {                                                                  \
> +        uint64_t ra = SEED_A;                                          \
> +        uint64_t rb = SEED_B;                                          \
> +        uint64_t rc = SEED_C;                                          \
> +        uint64_t i, j;                                                 \
> +                                                                       \
> +        for (i = 0; i < n_ops; i += OPS_PER_ITER) {                    \
> +            volatile PRECISION a = glue(get_random_, PRECISION)(&ra);  \
> +            volatile PRECISION b = glue(get_random_, PRECISION)(&rb);  \
> +            volatile PRECISION c = glue(get_random_, PRECISION)(&rc);  \
> +                                                                       \
> +            for (j = 0; j < OPS_PER_ITER; j++) {                       \
> +                *res = FUNC(a, b, c);                                  \
> +            }                                                          \
> +        }                                                              \
> +    }
> +
> +GEN_BENCH_3OPF(bench_float_fma, fmaf, float)
> +GEN_BENCH_3OPF(bench_double_fma, fma, double)
> +#undef GEN_BENCH_3OPF
> +
> +static void parse_args(int argc, char *argv[])
> +{
> +    int c;
> +
> +    for (;;) {
> +        c = getopt(argc, argv, "n:ho:p:");
> +        if (c < 0) {
> +            break;
> +        }
> +        switch (c) {
> +        case 'h':
> +            usage_complete(argc, argv);
> +            exit(0);
> +        case 'n':
> +            n_ops = atoll(optarg);
> +            if (n_ops < OPS_PER_ITER) {
> +                n_ops = OPS_PER_ITER;
> +            }
> +            n_ops -= n_ops % OPS_PER_ITER;
> +            break;
> +        case 'o':
> +            set_op(optarg);
> +            break;
> +        case 'p':
> +            precision = optarg;
> +            if (strcmp(precision, "float") &&
> +                strcmp(precision, "single") &&
> +                strcmp(precision, "double")) {
> +                fprintf(stderr, "Unsupported precision '%s'\n", precision);
> +                exit(EXIT_FAILURE);

Supporting half-precision, where the compiler supports it, would also
be useful here.
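Purely as a hypothetical, untested sketch of what that could look like
(assuming a compiler that provides the _Float16 type and predefines
something like __FLT16_MANT_DIG__ when it does, e.g. recent GCC/clang
on AArch64), slotted in next to the existing GEN_BENCH_2OP users:

  #if defined(__FLT16_MANT_DIG__)
  typedef _Float16 half;

  /*
   * Reuse the single-precision generator and narrow the result; good
   * enough for a first cut, although plenty of values will saturate to
   * +/-Inf in half precision.
   */
  static inline half get_random_half(uint64_t *x)
  {
      return (half)get_random_float(x);
  }

  GEN_BENCH_2OP(bench_half_add, +, half)
  GEN_BENCH_2OP(bench_half_mul, *, half)
  #endif

parse_args() and CALL_BENCH() would then need growing to accept "half"
as a precision, with a sensible error on compilers that don't provide
the type.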
> +            }
> +            break;
> +        }
> +    }
> +}
> +
> +#define CALL_BENCH(OP, PRECISION, RESP)                 \
> +    do {                                                \
> +        switch (OP) {                                   \
> +        case OP_ADD:                                    \
> +            glue(glue(bench_, PRECISION), _add)(RESP);  \
> +            break;                                      \
> +        case OP_SUB:                                    \
> +            glue(glue(bench_, PRECISION), _sub)(RESP);  \
> +            break;                                      \
> +        case OP_MUL:                                    \
> +            glue(glue(bench_, PRECISION), _mul)(RESP);  \
> +            break;                                      \
> +        case OP_DIV:                                    \
> +            glue(glue(bench_, PRECISION), _div)(RESP);  \
> +            break;                                      \
> +        case OP_FMA:                                    \
> +            glue(glue(bench_, PRECISION), _fma)(RESP);  \
> +            break;                                      \
> +        case OP_SQRT:                                   \
> +            glue(glue(bench_, PRECISION), _sqrt)(RESP); \
> +            break;                                      \
> +        default:                                        \
> +            g_assert_not_reached();                     \
> +        }                                               \
> +    } while (0)
> +
> +int main(int argc, char *argv[])
> +{
> +    int64_t t0, t1;
> +    double resd;
> +
> +    parse_args(argc, argv);
> +    if (!strcmp(precision, "float") || !strcmp(precision, "single")) {
> +        float res;
> +        t0 = get_clock_realtime();
> +        CALL_BENCH(op, float, &res);
> +        t1 = get_clock_realtime();
> +        resd = res;
> +    } else if (!strcmp(precision, "double")) {
> +        t0 = get_clock_realtime();
> +        CALL_BENCH(op, double, &resd);
> +        t1 = get_clock_realtime();
> +    } else {
> +        g_assert_not_reached();
> +    }
> +    printf("%.2f MFlops\n", (double)n_ops / (t1 - t0) * 1e3);
> +    if (resd) {
> +        return 0;
> +    }
> +    return 0;
> +}
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 18e58b2..df69175 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -12,6 +12,7 @@ check-qobject
>  check-qstring
>  check-qom-interface
>  check-qom-proplist
> +fp-bench
>  qht-bench
>  rcutorture
>  test-aio
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index ef9b88c..f6121ee 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o tests/check-qdict.o \
>      tests/rcutorture.o tests/test-rcu-list.o \
>      tests/test-qdist.o tests/test-shift128.o \
>      tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> -    tests/atomic_add-bench.o
> +    tests/atomic_add-bench.o tests/fp-bench.o

Not sure why, but "make check" didn't build this; I had to run
"make tests/fp-bench" explicitly. I guess these, along with
atomic_add-bench, are explicitly guest-facing tests, so maybe we should
move them once tests/tcg is working again. I'll have another run at
that this week.

>
>  $(test-obj-y): QEMU_INCLUDES += -Itests
>  QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(tes
>  tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
>  tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
>  tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
> +tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
>
>  tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
>      hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\

Anyway for this version:

Reviewed-by: Alex Bennée <alex.ben...@linaro.org>

--
Alex Bennée