Performance results for fp-bench: 1. Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz - before: cmp-single: 113.01 MFlops cmp-double: 115.54 MFlops - after: cmp-single: 527.83 MFlops cmp-double: 457.21 MFlops
2. ARM Aarch64 A57 @ 2.4GHz - before: cmp-single: 39.32 MFlops cmp-double: 39.80 MFlops - after: cmp-single: 162.74 MFlops cmp-double: 167.08 MFlops 3. IBM POWER8E @ 2.1 GHz - before: cmp-single: 60.81 MFlops cmp-double: 62.76 MFlops - after: cmp-single: 235.39 MFlops cmp-double: 283.44 MFlops Here using float{32,64}_is_any_nan is faster than using isnan on all machines. On x86_64 the perf difference is just a few percentage points, but on aarch64 we go from 117/119 to 164/169 MFlops for single/double precision, respectively. Aggregate performance improvement for the last few patches: [ all charts in png: https://imgur.com/a/4yV8p ] 1. Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz qemu-aarch64 NBench score; higher is better Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz 16 +-+-----------+-------------+----===-------+---===-------+-----------+-+ 14 +-+..........................@@@&&.=.......@@@&&.=...................+-+ 12 +-+..........................@.@.&.=.......@.@.&.=.....+befor=== +-+ 10 +-+..........................@.@.&.=.......@.@.&.=.....+ad@@&& = +-+ 8 +-+.......................$$$%.@.&.=.......@.@.&.=.....+ @@u& = +-+ 6 +-+............@@@&&=+***##.$%.@.&.=***##$$%+@.&.=..###$$%%@i& = +-+ 4 +-+.......###$%%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=+**.#+$ +@m& = +-+ 2 +-+.....***.#$.%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=.**.#+$+sqr& = +-+ 0 +-+-----***##$%%@@&&=-***##$$%@@&&==***##$$%@@&&==-**##$$%+cmp==-----+-+ FOURIER NEURAL NET LU DECOMPOSITION gmean qemu-aarch64 SPEC06fp (test set) speedup over QEMU 4c2c1015905 Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz error bars: 95% confidence interval 4.5 +-+---+-----+----+-----+-----+-&---+-----+----+-----+-----+-----+----+-----+-----+-----+-----+----+-----+---+-+ 4 +-+..........................+@@+...........................................................................+-+ 3.5 +-+..............%%@&.........@@..............%%@&............................................+++dsub +-+ 2.5 
+-+....&&+.......%%@&.......+%%@..+%%&+..@@&+.%%@&....................................+%%&+.+%@&++%%@& +-+ 2 +-+..+%%&..+%@&+.%%@&...+++..%%@...%%&.+$$@&..%%@&..%%@&.......+%%&+.%%@&+......+%%@&.+%%&++$$@&++d%@& %%@&+-+ 1.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+f%@&**$%@&+-+ 0.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&+sqr@&**$%@&+-+ 0 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+cmp&**$%@&+-+ 410.bw416.gam433.434.z435.436.cac437.lesli444.447.de450.so453454.ca459.GemsF465.tont470.lb4482.sphinxgeomean 2. Host: ARM Aarch64 A57 @ 2.4GHz qemu-aarch64 NBench score; higher is better Host: Applied Micro X-Gene, Aarch64 A57 @ 2.4 GHz 5 +-+-----------+-------------+-------------+-------------+-----------+-+ 4.5 +-+........................................@@@&==...................+-+ 3 4 +-+..........................@@@&==........@.@&.=.....+before +-+ 3 +-+..........................@.@&.=........@.@&.=.....+ad@@@&== +-+ 2.5 +-+.....................##$$%%.@&.=........@.@&.=.....+ @m@& = +-+ 2 +-+............@@@&==.***#.$.%.@&.=.***#$$%%.@&.=.***#$$%%d@& = +-+ 1.5 +-+.....***#$$%%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$ +f@& = +-+ 0.5 +-+.....*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$+sqr& = +-+ 0 +-+-----***#$$%%@@&==-***#$$%%@@&==-***#$$%%@@&==-***#$$%+cmp==-----+-+ FOURIER NEURAL NET LU DECOMPOSITION gmean Note that by not inlining the soft-fp primitives we end up with a smaller softfloat.o--in particular, see the difference for the softfloat.o built for fp-bench: - before this series: text data bss dec hex filename 103235 0 0 103235 19343 softfloat.o - after: text data bss dec hex filename 93369 0 0 93369 16cb9 softfloat.o Signed-off-by: Emilio G. 
Cota <c...@braap.org> --- fpu/softfloat.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index f89e872..1cf74d1 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2671,28 +2671,74 @@ static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, } } -#define COMPARE(sz) \ -int float ## sz ## _compare(float ## sz a, float ## sz b, \ - float_status *s) \ +#define COMPARE(name, attr, sz) \ +static int attr \ +name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ { \ FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, false, s); \ -} \ -int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ - float_status *s) \ -{ \ - FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ - FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, true, s); \ + return compare_floats(pa, pb, is_quiet, s); \ } -COMPARE(16) -COMPARE(32) -COMPARE(64) +COMPARE(soft_float16_compare, , 16) +COMPARE(soft_float32_compare, QEMU_SOFTFLOAT_ATTR, 32) +COMPARE(soft_float64_compare, QEMU_SOFTFLOAT_ATTR, 64) #undef COMPARE +int __attribute__((flatten)) +float16_compare(float16 a, float16 b, float_status *s) +{ + return soft_float16_compare(a, b, false, s); +} + +int __attribute__((flatten)) +float16_compare_quiet(float16 a, float16 b, float_status *s) +{ + return soft_float16_compare(a, b, true, s); +} + +#define GEN_FPU_COMPARE(name, quiet_name, soft_t, host_t) \ + static int \ + fpu_ ## name(soft_t a, soft_t b, bool is_quiet, float_status *s) \ + { \ + host_t ha, hb; \ + \ + if (QEMU_NO_HARDFLOAT) { \ + return soft_ ## name(a, b, is_quiet, s); \ + } \ + soft_t ## _input_flush2(&a, &b, s); \ + ha = soft_t ## _to_ ## host_t(a); \ + hb = soft_t ## _to_ ## host_t(b); \ + if (unlikely(soft_t ## _is_any_nan(a) || \ + soft_t ## 
_is_any_nan(b))) { \ + return soft_ ## name(a, b, is_quiet, s); \ + } \ + if (isgreater(ha, hb)) { \ + return float_relation_greater; \ + } \ + if (isless(ha, hb)) { \ + return float_relation_less; \ + } \ + return float_relation_equal; \ + } \ + \ + int __attribute__((flatten)) \ + name(soft_t a, soft_t b, float_status *s) \ + { \ + return fpu_ ## name(a, b, false, s); \ + } \ + \ + int __attribute__((flatten)) \ + quiet_name(soft_t a, soft_t b, float_status *s) \ + { \ + return fpu_ ## name(a, b, true, s); \ + } + +GEN_FPU_COMPARE(float32_compare, float32_compare_quiet, float32, float) +GEN_FPU_COMPARE(float64_compare, float64_compare_quiet, float64, double) +#undef GEN_FPU_COMPARE + /* Multiply A by 2 raised to the power N. */ static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) { -- 2.7.4