On Thu, Mar 22, 2018 at 13:05:00 +0800, Richard Henderson wrote: > On 03/22/2018 04:11 AM, Emilio G. Cota wrote: > > +#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t, \ > > + host_abs_func, min_normal) \ > > + static inline __attribute__((always_inline)) soft_t \ > > + fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract, \ > > + float_status *s) \ > > + { \ > > + soft_t ## _input_flush2(&a, &b, s); \ > > + if (likely((soft_t ## _is_normal(a) || soft_t ## _is_zero(a)) && \ > > + (soft_t ## _is_normal(b) || soft_t ## _is_zero(b)) && \ > > + s->float_exception_flags & float_flag_inexact && \ > > + s->float_rounding_mode == float_round_nearest_even)) { \ > > + host_t ha = soft_t ## _to_ ## host_t(a); \ > > + host_t hb = soft_t ## _to_ ## host_t(b); \ > > + host_t hr; \ > > + soft_t r; \ > > + \ > > + if (subtract) { \ > > + hb = -hb; \ > > + } \ > > + hr = ha + hb; \ > > + r = host_t ## _to_ ## soft_t(hr); \ > > + if (unlikely(soft_t ## _is_infinity(r))) { \ > > + s->float_exception_flags |= float_flag_overflow; \ > > + } else if (unlikely(host_abs_func(hr) <= min_normal)) { \ > > + goto soft; \ > > + } \ > > + return r; \ > > + } \ > > + soft: \ > > Is there any especially good reason you want to not put this code into the > normal softfloat function? Does it really many any measurable difference at > all to force this code to be inlined into a helper?
You mean to do this? (... or see below) --- a/fpu/hostfloat.c +++ b/fpu/hostfloat.c @@ -97,7 +97,7 @@ GEN_INPUT_FLUSH(float64) #define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t, \ host_abs_func, min_normal) \ - static inline __attribute__((always_inline)) soft_t \ + static soft_t \ fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract, \ float_status *s) \ { \ That slows add/sub dramatically, because addsub is not inlined into float32_add and float32_sub (that's an extra function call plus an extra branch per emulated op). For x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add, the above gives - before: 188.06 MFlops - after: 117.56 MFlops ... or did you miss that fpu_##soft_t##_addsub is only called by soft_t ## _add / sub, which are standalone? That is: +#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t, \ + host_abs_func, min_normal) \ (...) + soft_t add_name(soft_t a, soft_t b, float_status *status) \ + { \ + return fpu_ ## soft_t ## _addsub(a, b, false, status); \ + } \ + \ + soft_t sub_name(soft_t a, soft_t b, float_status *status) \ + { \ + return fpu_ ## soft_t ## _addsub(a, b, true, status); \ + } + +GEN_FPU_ADDSUB(float32_add, float32_sub, float32, float, fabsf, FLT_MIN) +GEN_FPU_ADDSUB(float64_add, float64_sub, float64, double, fabs, DBL_MIN) +#undef GEN_FPU_ADDSUB Note that add/sub_name expand to float32/64_add/sub; I like having the full names in the macro to ease grepping. Thanks, E.