On Fri, Mar 4, 2022 at 3:28 PM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > For parameter passing through stack, vectorized load from parm_decl > in callee may trigger serious STF issue. This is why GCC12 regresses > 50% for cray at -O2 compared to GCC11. > > The patch add an extremely large number to stmt_cost to prevent > vectorization for loads from parm_decl under very-cheap cost model, > this can at least prevent O2 regression due to STF issue, but may lose > some perf where there's no such issue(1 vector_load vs n scalar_load + > CTOR). > > No impact for SPEC2017 for both plain O2 and native O2 on ICX. > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/101908 > * config/i386/i386.cc (ix86_load_maybe_stfs_p): New. > (ix86_vector_costs::add_stmt_cost): Add extra cost for > vector_load/unsigned_load which may have stall forward issue. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr101908-1.c: New test. > * gcc.target/i386/pr101908-2.c: New test. > --- > gcc/config/i386/i386.cc | 31 ++++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr101908-1.c | 12 +++++++++ > gcc/testsuite/gcc.target/i386/pr101908-2.c | 12 +++++++++ > 3 files changed, 55 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr101908-2.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index b2bf90576d5..3bbaaf65ea8 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -22976,6 +22976,19 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, > struct noce_if_info *if_info) > return default_noce_conversion_profitable_p (seq, if_info); > } > > +/* Return true if REF may have STF issue, otherwise false. */ > +static bool > +ix86_load_maybe_stfs_p (tree ref) > +{ > + tree addr = get_base_address (ref); > + > + if (TREE_CODE (addr) != PARM_DECL > + || !tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (addr))) > + || tree_to_uhwi (TYPE_SIZE (TREE_TYPE (addr))) <= MAX_BITS_PER_WORD) > + return false; > + return true; > +} > + > /* x86-specific vector costs. */ > class ix86_vector_costs : public vector_costs > { > @@ -23203,6 +23216,24 @@ ix86_vector_costs::add_stmt_cost (int count, > vect_cost_for_stmt kind, > if (TREE_CODE (op) == SSA_NAME) > TREE_VISITED (op) = 0; > } > + > + /* Prevent vectorization for load from parm_decl at O2 to avoid STF issue. > + Performance may lose when there's no STF issue(1 vector_load vs n > + scalar_load + CTOR). > + TODO: both extra cost(2000) and ix86_load_maybe_stfs_p need to be fine > + tuned. */ > + if ((kind == vector_load || kind == unaligned_load) > + && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP > + && stmt_info > + && stmt_info->slp_type == pure_slp > + && stmt_info->stmt > + && gimple_assign_load_p (stmt_info->stmt) > + && ix86_load_maybe_stfs_p (gimple_assign_rhs1 (stmt_info->stmt))) > + { > + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); > + stmt_cost += 2000; > + } > + > if (stmt_cost == -1) > stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); > > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c > b/gcc/testsuite/gcc.target/i386/pr101908-1.c > new file mode 100644 > index 00000000000..f8e0f2e26bb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-slp-details" } */ > +/* { dg-final { scan-tree-dump {(?n)add new stmt:.*MEM \<vector\(2\) > double\>} "slp2" } } */ > + > +struct X { double x[2]; }; > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df __attribute__((noipa)) > +foo (struct X* x, struct X* y) > +{ > + return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] }; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c > b/gcc/testsuite/gcc.target/i386/pr101908-2.c > new file mode 100644 > index 00000000000..7f2f00cebab > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-slp-details" } */ > +/* { dg-final { scan-tree-dump-not {(?n)add new stmt:.*MEM \<vector\(2\) > double\>} "slp2" } } */ > + > +struct X { double x[2]; }; > +typedef double v2df __attribute__((vector_size(16))); > + > +v2df __attribute__((noipa)) > +foo (struct X x, struct X y) > +{ > + return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] }; > +} > -- > 2.18.1 >
-- BR, Hongtao