On 12/06/17 14:16, Wilco Dijkstra wrote: > The Cortex-A53 scheduler model of FMAC bypass is not quite right > for FMAC to FMAC forwarding. Experiments also show the latencies of > FP operations are too high. Rather than adding more bypasses, > adjust the latencies of FP instructions to get a better schedule on > average. As a result SPECFP2006 is 1.1% faster. > > Passes AArch64 and ARM bootstrap and regress. > > ChangeLog: > 2017-05-30 Wilco Dijkstra <wdijk...@arm.com> > > * config/arm/cortex-a53.md (cortex_a53_fpalu): Adjust latency. > (cortex_a53_fconst): Likewise. > (cortex_a53_fpmul): Likewise. > (cortex_a53_f_load_64): Likewise. > (cortex_a53_f_load_many): Likewise. > (cortex_a53_advsimd_alu): Likewise. > (cortex_a53_advsimd_alu_q): Likewise. > (cortex_a53_advsimd_mul): Likewise. > (cortex_a53_advsimd_mul_q): Likewise. > (fpmac bypass): Add new bypass for fpmac-fpmac case. > Add missing fmul, r2f_cvt and fconst cases.
OK. R. > -- > diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md > index > 403e84b5d4c3993c06ee879d147f9c5eb6dd3b9a..49f80d3130f97777260e9d3c6d4f37ef3db20c94 > 100644 > --- a/gcc/config/arm/cortex-a53.md > +++ b/gcc/config/arm/cortex-a53.md > @@ -503,19 +503,19 @@ (define_cpu_unit "cortex_a53_crypto" > ;; Floating-point arithmetic. > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -(define_insn_reservation "cortex_a53_fpalu" 5 > +(define_insn_reservation "cortex_a53_fpalu" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov, > f_cvt, fcmps, fcmpd, fccmps, fccmpd, fcsel, > f_rints, f_rintd, f_minmaxs, f_minmaxd")) > "cortex_a53_slot_any,cortex_a53_fp_alu") > > -(define_insn_reservation "cortex_a53_fconst" 3 > +(define_insn_reservation "cortex_a53_fconst" 2 > (and (eq_attr "tune" "cortexa53") > (eq_attr "type" "fconsts,fconstd")) > "cortex_a53_slot_any,cortex_a53_fp_alu") > > -(define_insn_reservation "cortex_a53_fpmul" 5 > +(define_insn_reservation "cortex_a53_fpmul" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "type" "fmuls,fmuld")) > "cortex_a53_slot_any,cortex_a53_fp_mul") > @@ -566,14 +566,14 @@ (define_insn_reservation "cortex_a53_f_flags" 5 > ;; Floating-point load/store. 
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -(define_insn_reservation "cortex_a53_f_load_64" 4 > +(define_insn_reservation "cortex_a53_f_load_64" 3 > (and (eq_attr "tune" "cortexa53") > (ior (eq_attr "type" "f_loads,f_loadd") > (eq_attr "cortex_a53_advsimd_type" > "advsimd_load_64"))) > "cortex_a53_slot_any+cortex_a53_ls_agen") > > -(define_insn_reservation "cortex_a53_f_load_many" 5 > +(define_insn_reservation "cortex_a53_f_load_many" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "cortex_a53_advsimd_type" > "advsimd_load_128,advsimd_load_lots")) > @@ -604,22 +604,22 @@ (define_insn_reservation "cortex_a53_f_store_many" 0 > ;; or a 128-bit operation in which case we require in our model that we > ;; issue from slot 0. > > -(define_insn_reservation "cortex_a53_advsimd_alu" 5 > +(define_insn_reservation "cortex_a53_advsimd_alu" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "cortex_a53_advsimd_type" "advsimd_alu")) > "cortex_a53_slot_any,cortex_a53_fp_alu") > > -(define_insn_reservation "cortex_a53_advsimd_alu_q" 5 > +(define_insn_reservation "cortex_a53_advsimd_alu_q" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "cortex_a53_advsimd_type" "advsimd_alu_q")) > "cortex_a53_slot0,cortex_a53_fp_alu_q") > > -(define_insn_reservation "cortex_a53_advsimd_mul" 5 > +(define_insn_reservation "cortex_a53_advsimd_mul" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "cortex_a53_advsimd_type" "advsimd_mul")) > "cortex_a53_slot_any,cortex_a53_fp_mul") > > -(define_insn_reservation "cortex_a53_advsimd_mul_q" 5 > +(define_insn_reservation "cortex_a53_advsimd_mul_q" 4 > (and (eq_attr "tune" "cortexa53") > (eq_attr "cortex_a53_advsimd_type" "advsimd_mul_q")) > "cortex_a53_slot0,cortex_a53_fp_mul_q") > @@ -698,20 +698,18 @@ (define_insn_reservation "cortex_a53_crypto_sha_slow" 5 > ;; multiply-accumulate operations as a bypass reducing the latency > ;; of producing instructions to near zero. 
> > -(define_bypass 1 "cortex_a53_fp*, > +(define_bypass 1 "cortex_a53_fpalu, > + cortex_a53_fpmul, > cortex_a53_r2f, > + cortex_a53_r2f_cvt, > + cortex_a53_fconst, > cortex_a53_f_load*" > "cortex_a53_fpmac" > "aarch_accumulator_forwarding") > > -;; Model a bypass from the result of an FP operation to a use. > - > -(define_bypass 4 "cortex_a53_fpalu, > - cortex_a53_fpmul" > - "cortex_a53_fpalu, > - cortex_a53_fpmul, > - cortex_a53_fpmac, > - cortex_a53_advsimd_div*") > +(define_bypass 4 "cortex_a53_fpmac" > + "cortex_a53_fpmac" > + "aarch_accumulator_forwarding") > > ;; We want AESE and AESMC to end up consecutive to one another. > >