> LGTM, but please note there are a whole lot of CMOVE issues on x86, > collected in [1] meta-bug, especially [2]. > > [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85559 > [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56309
Thanks for noting this, I've verified my patch have no impact for the test in [2]. We will keep monitoring those cmove issues. Uros Bizjak <ubiz...@gmail.com> 于2025年1月7日周二 17:34写道: > > On Tue, Jan 7, 2025 at 8:37 AM Hongyu Wang <hongyu.w...@intel.com> wrote: > > > > Hi, > > > > For later processors, the pipeline went deeper so the penalty for > > untaken branch can be larger than before. Add a new parameter > > br_mispredict_scale to describe the penalty, and adopt to > > noce_max_ifcvt_seq_cost hook to allow longer sequence to be > > converted with cmove. > > > > This improves cpu2017 544 with -Ofast -march=native for 14% on P-core > > SPR, and 8% on E-core SRF, no other regression observed. > > > > Bootstrapped & regtested on x86-64-pc-linux-gnu. > > > > OK for trunk? > > > > gcc/ChangeLog: > > > > * config/i386/i386.cc (ix86_noce_max_ifcvt_seq_cost): Adjust > > cost with ix86_tune_cost->br_mispredict_scale. > > * config/i386/i386.h (processor_costs): Add br_mispredict_scale. > > * config/i386/x86-tune-costs.h: Add new br_mispredict_scale to > > all processor_costs, in which icelake_cost/alderlake_cost > > with value COSTS_N_INSNS (2) + 3 and other processor with value > > COSTS_N_INSNS (2). > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/cmov12.c: New test. > > LGTM, but please note there are a whole lot of CMOVE issues on x86, > collected in [1] meta-bug, especially [2]. > > [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85559 > [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56309 > > Thanks, > Uros. > > > --- > > gcc/config/i386/i386.cc | 8 ++++++- > > gcc/config/i386/i386.h | 2 ++ > > gcc/config/i386/x86-tune-costs.h | 33 ++++++++++++++++++++++++++ > > gcc/testsuite/gcc.target/i386/cmov12.c | 21 ++++++++++++++++ > > 4 files changed, 63 insertions(+), 1 deletion(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/cmov12.c > > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > > index 655335e2f47..11770aa8a50 100644 > > --- a/gcc/config/i386/i386.cc > > +++ b/gcc/config/i386/i386.cc > > @@ -25088,7 +25088,13 @@ ix86_max_noce_ifcvt_seq_cost (edge e) > > return param_max_rtl_if_conversion_unpredictable_cost; > > } > > > > - return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2); > > + /* For modern machines with deeper pipeline, the penalty for branch > > + misprediction could be higher than before to reset the pipeline > > + slots. Add parameter br_mispredict_scale as a factor to describe > > + the impact of reseting the pipeline. */ > > + > > + return BRANCH_COST (true, predictable_p) > > + * ix86_tune_cost->br_mispredict_scale; > > } > > > > /* Return true if SEQ is a good candidate as a replacement for the > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 00626d539a9..e8e528c7811 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -232,6 +232,8 @@ struct processor_costs { > > to be unrolled. */ > > const unsigned small_unroll_factor; /* Unroll factor for small loop to > > be unrolled. */ > > + const int br_mispredict_scale; /* Branch mispredict scale for ifcvt > > + threshold. */ > > }; > > > > extern const struct processor_costs *ix86_cost; > > diff --git a/gcc/config/i386/x86-tune-costs.h > > b/gcc/config/i386/x86-tune-costs.h > > index 56a09f12b94..a4a128cd5dd 100644 > > --- a/gcc/config/i386/x86-tune-costs.h > > +++ b/gcc/config/i386/x86-tune-costs.h > > @@ -137,6 +137,7 @@ struct processor_costs ix86_size_cost = {/* costs for > > tuning for size */ > > NULL, /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* Processor costs (relative to an add) */ > > @@ -248,6 +249,7 @@ struct processor_costs i386_cost = { /* 386 > > specific costs */ > > "4", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs i486_memcpy[2] = { > > @@ -360,6 +362,7 @@ struct processor_costs i486_cost = { /* 486 > > specific costs */ > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs pentium_memcpy[2] = { > > @@ -470,6 +473,7 @@ struct processor_costs pentium_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static const > > @@ -573,6 +577,7 @@ struct processor_costs lakemont_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes > > @@ -691,6 +696,7 @@ struct processor_costs pentiumpro_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs geode_memcpy[2] = { > > @@ -800,6 +806,7 @@ struct processor_costs geode_cost = { > > NULL, /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs k6_memcpy[2] = { > > @@ -912,6 +919,7 @@ struct processor_costs k6_cost = { > > "32", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* For some reason, Athlon deals better with REP prefix (relative to loops) > > @@ -1025,6 +1033,7 @@ struct processor_costs athlon_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* K8 has optimized REP instruction for medium sized blocks, but for very > > @@ -1147,6 +1156,7 @@ struct processor_costs k8_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for > > @@ -1277,6 +1287,7 @@ struct processor_costs amdfam10_cost = { > > "32", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* BDVER has optimized REP instruction for medium sized blocks, but for > > @@ -1400,6 +1411,7 @@ const struct processor_costs bdver_cost = { > > "11", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > > > @@ -1555,6 +1567,7 @@ struct processor_costs znver1_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* ZNVER2 has optimized REP instruction for medium sized blocks, but for > > @@ -1714,6 +1727,7 @@ struct processor_costs znver2_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > struct processor_costs znver3_cost = { > > @@ -1848,6 +1862,7 @@ struct processor_costs znver3_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* This table currently replicates znver3_cost table. */ > > @@ -1984,6 +1999,7 @@ struct processor_costs znver4_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* This table currently replicates znver4_cost table. */ > > @@ -2137,6 +2153,7 @@ struct processor_costs znver5_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ > > @@ -2263,6 +2280,7 @@ struct processor_costs skylake_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* icelake_cost should produce code tuned for Icelake family of CPUs. > > @@ -2391,6 +2409,7 @@ struct processor_costs icelake_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */ > > }; > > > > /* alderlake_cost should produce code tuned for alderlake family of CPUs. > > */ > > @@ -2513,6 +2532,7 @@ struct processor_costs alderlake_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2) + 3, /* Branch mispredict scale. */ > > }; > > > > /* BTVER1 has optimized REP instruction for medium sized blocks, but for > > @@ -2628,6 +2648,7 @@ const struct processor_costs btver1_cost = { > > "11", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs btver2_memcpy[2] = { > > @@ -2740,6 +2761,7 @@ const struct processor_costs btver2_cost = { > > "11", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs pentium4_memcpy[2] = { > > @@ -2851,6 +2873,7 @@ struct processor_costs pentium4_cost = { > > NULL, /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs nocona_memcpy[2] = { > > @@ -2965,6 +2988,7 @@ struct processor_costs nocona_cost = { > > NULL, /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs atom_memcpy[2] = { > > @@ -3077,6 +3101,7 @@ struct processor_costs atom_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs slm_memcpy[2] = { > > @@ -3189,6 +3214,7 @@ struct processor_costs slm_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs tremont_memcpy[2] = { > > @@ -3315,6 +3341,7 @@ struct processor_costs tremont_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > static stringop_algs intel_memcpy[2] = { > > @@ -3427,6 +3454,7 @@ struct processor_costs intel_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */ > > @@ -3544,6 +3572,7 @@ struct processor_costs lujiazui_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */ > > @@ -3659,6 +3688,7 @@ struct processor_costs yongfeng_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. > > */ > > @@ -3774,6 +3804,7 @@ struct processor_costs shijidadao_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > > > @@ -3897,6 +3928,7 @@ struct processor_costs generic_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > /* core_cost should produce code tuned for Core familly of CPUs. */ > > @@ -4025,5 +4057,6 @@ struct processor_costs core_cost = { > > "16", /* Func alignment. */ > > 4, /* Small unroll limit. */ > > 2, /* Small unroll factor. */ > > + COSTS_N_INSNS (2), /* Branch mispredict scale. */ > > }; > > > > diff --git a/gcc/testsuite/gcc.target/i386/cmov12.c > > b/gcc/testsuite/gcc.target/i386/cmov12.c > > new file mode 100644 > > index 00000000000..87de4f4117b > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/cmov12.c > > @@ -0,0 +1,21 @@ > > +/* { dg-do compile { target { ! ia32 } } } */ > > +/* { dg-options "-O2 -mavx2 -mtune=sapphirerapids" } */ > > +/* { dg-final { scan-assembler-times "cmovg" 3 } } */ > > + > > +void foo(int *a, int n, int k) > > +{ > > + int j, v; > > + > > + v = a[k - 1]; > > + while (k <= n / 2) { > > + j = k + k; > > + if ((j < n) && (a[j - 1] < a[j])) > > + j++; > > + if (v >= a[j - 1]) > > + break; > > + a[k - 1] = a[j - 1]; > > + k = j; > > + } > > + a[k - 1] = v; > > +} > > + > > -- > > 2.31.1 > >