On Wed, Feb 12, 2025 at 6:58 AM Richard Biener <rguent...@suse.de> wrote:
>
> For the testcase in question, which uses a fold-left vectorized
> reduction of a reverse-iterating loop, we'd need two forwprop
> invocations to first bypass the permute emitted for the
> reverse-iterating loop and then to decompose the vector load that
> only feeds element extracts.  The following moves the first transform
> to a match.pd pattern and makes sure we fold the element extracts
> when the vectorizer emits them, so the single forwprop pass can
> then pick up the vector load decomposition, avoiding the forwarding
> fail this otherwise causes.
>
> Moving simplify_bitfield_ref also makes forwprop remove the dead
> VEC_PERM_EXPR via the simple-dce it uses - this was also
> previously missing.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
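Just to spell out the first transform for the PR 90579 shape: the
single-element case only remaps the extract's lane through the permute
selector and then reads that lane directly from the corresponding
VEC_PERM_EXPR operand.  A small standalone sketch of the remapping (not
GCC code; all names below are invented for illustration):

  /* Model of the single-element case: extracting lane idx from
     VEC_PERM_EXPR <v0, v1, sel> becomes extracting lane sel[idx]
     (clamped) from v0 or v1 directly.  */
  #include <stdio.h>

  struct lane { int operand; unsigned elt; };

  static struct lane
  remap_extract (const unsigned *sel, unsigned nelts, unsigned idx)
  {
    unsigned l = sel[idx] % (2 * nelts);  /* clamp as VEC_PERM_EXPR does */
    struct lane r = { l < nelts ? 0 : 1, l < nelts ? l : l - nelts };
    return r;
  }

  int
  main (void)
  {
    /* A reversing selector on a 4-lane vector, like the one emitted for
       the reverse access in the testcase,
       i.e. VEC_PERM_EXPR <x, x, { 3, 2, 1, 0 }>.  */
    unsigned sel[4] = { 3, 2, 1, 0 };
    for (unsigned i = 0; i < 4; i++)
      {
        struct lane l = remap_extract (sel, 4, i);
        printf ("extract lane %u -> operand %d, lane %u\n",
                i, l.operand, l.elt);
      }
    return 0;
  }

Every extract thus lands back on a lane of the original vector load,
which is what lets the single forwprop pass split that load afterwards.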
LGTM
>
> Thanks,
> Richard.
>
>         PR tree-optimization/90579
>         * tree-ssa-forwprop.cc (simplify_bitfield_ref): Move to
>         match.pd.
>         (pass_forwprop::execute): Adjust.
>         * match.pd (bit_field_ref (vec_perm ...)): New pattern
>         modeled after simplify_bitfield_ref.
>         * tree-vect-loop.cc (vect_expand_fold_left): Fold the
>         element extract stmt, combining it with the vector def.
>
>         * gcc.target/i386/pr90579.c: New testcase.
> ---
>  gcc/match.pd                            |  56 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr90579.c |  23 ++++++
>  gcc/tree-ssa-forwprop.cc                | 103 +-----------------------
>  gcc/tree-vect-loop.cc                   |   5 ++
>  4 files changed, 85 insertions(+), 102 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90579.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 20b2aec6f37..ea44201f2eb 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9538,6 +9538,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / const_k)->value; }
>                       @1 { bitsize_int ((idx % const_k) * width); })))))))))
>
> +(simplify
> + (BIT_FIELD_REF (vec_perm@0 @1 @2 VECTOR_CST@3) @rsize @rpos)
> + (with
> +  {
> +    tree elem_type = TREE_TYPE (TREE_TYPE (@0));
> +    poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
> +    poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
> +    unsigned HOST_WIDE_INT nelts, idx;
> +    unsigned HOST_WIDE_INT nelts_op = 0;
> +  }
> +  (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), elem_size, &idx)
> +       && VECTOR_CST_NELTS (@3).is_constant (&nelts)
> +       && (known_eq (size, elem_size)
> +           || (constant_multiple_p (size, elem_size, &nelts_op)
> +               && pow2p_hwi (nelts_op))))
> +   (with
> +    {
> +      bool ok = true;
> +      /* One element.  */
> +      if (known_eq (size, elem_size))
> +        idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (@3, idx)) % (2 * nelts);
> +      else
> +        {
> +          /* Clamp vec_perm_expr index.  */
> +          unsigned start
> +            = TREE_INT_CST_LOW (vector_cst_elt (@3, idx)) % (2 * nelts);
> +          unsigned end
> +            = (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + nelts_op - 1))
> +               % (2 * nelts));
> +          /* Be in the same vector.  */
> +          if ((start < nelts) != (end < nelts))
> +            ok = false;
> +          else
> +            for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> +              {
> +                /* Continuous area.  */
> +                if ((TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i))
> +                     % (2 * nelts) - 1)
> +                    != (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i - 1))
> +                        % (2 * nelts)))
> +                  {
> +                    ok = false;
> +                    break;
> +                  }
> +              }
> +          /* Alignment not worse than before.  */
> +          if (start % nelts_op)
> +            ok = false;
> +          idx = start;
> +        }
> +    }
> +    (if (ok)
> +     (if (idx < nelts)
> +      (BIT_FIELD_REF @1 @rsize { bitsize_int (idx * elem_size); })
> +      (BIT_FIELD_REF @2 @rsize { bitsize_int ((idx - nelts) * elem_size); })))))))
> +
>  /* Simplify a bit extraction from a bit insertion for the cases with
>     the inserted element fully covering the extraction or the insertion
>     not touching the extraction.  */
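The sub-vector (multi-element) case above carries over the same
legality checks as the forwprop code it replaces: all selected lanes
have to come from the same permute input, be consecutive, and start at
an nelts_op-aligned lane.  In isolation the check amounts to roughly
the following (a standalone sketch, not GCC code; names are invented):

  #include <stdbool.h>
  #include <stdio.h>

  /* May an extract of nelts_op lanes starting at lane idx bypass a
     permute with selector sel over two nelts-lane inputs?  */
  static bool
  subvector_extract_ok (const unsigned *sel, unsigned nelts,
                        unsigned idx, unsigned nelts_op)
  {
    unsigned start = sel[idx] % (2 * nelts);
    unsigned end = sel[idx + nelts_op - 1] % (2 * nelts);
    if ((start < nelts) != (end < nelts))       /* same input vector */
      return false;
    for (unsigned i = 1; i != nelts_op; i++)    /* consecutive lanes */
      if (sel[idx + i] % (2 * nelts) != sel[idx + i - 1] % (2 * nelts) + 1)
        return false;
    return start % nelts_op == 0;               /* alignment not worse */
  }

  int
  main (void)
  {
    /* The example mask from simplify_bitfield_ref's comment below.  */
    unsigned sel[4] = { 2, 3, 2, 3 };
    unsigned rev[4] = { 3, 2, 1, 0 };
    /* Lanes 2,3 of the first input are consecutive and aligned: OK.  */
    printf ("%d\n", subvector_extract_ok (sel, 4, 0, 2));
    /* A reversed pair is not consecutive, so the permute must stay.  */
    printf ("%d\n", subvector_extract_ok (rev, 4, 0, 2));
    return 0;
  }

Same conditions as the removed forwprop code, just written out flat.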
> diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
> new file mode 100644
> index 00000000000..ab48a44063c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr90579.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mfpmath=sse" } */
> +
> +extern double r[6];
> +extern double a[];
> +
> +double
> +loop (int k, double x)
> +{
> +  int i;
> +  double t=0;
> +  for (i=0;i<6;i++)
> +    r[i] = x * a[i + k];
> +  for (i=0;i<6;i++)
> +    t+=r[5-i];
> +  return t;
> +}
> +
> +/* Verify we end up with scalar loads from r for the final sum.  */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+40" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+32" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+24" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+16" } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 9474682152a..fafc4d6b77a 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>
>  static bool forward_propagate_addr_expr (tree, tree, bool);
> +static void optimize_vector_load (gimple_stmt_iterator *);
>
>  /* Set to true if we delete dead edges during the optimization.  */
>  static bool cfg_changed;
> @@ -2481,106 +2482,6 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
>  }
>
>
> -/* Combine an element access with a shuffle.  Returns true if there were
> -   any changes made, else it returns false.  */
> -
> -static bool
> -simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> -{
> -  gimple *stmt = gsi_stmt (*gsi);
> -  gimple *def_stmt;
> -  tree op, op0, op1;
> -  tree elem_type, type;
> -  tree p, m, tem;
> -  unsigned HOST_WIDE_INT nelts, idx;
> -  poly_uint64 size, elem_size;
> -  enum tree_code code;
> -
> -  op = gimple_assign_rhs1 (stmt);
> -  gcc_checking_assert (TREE_CODE (op) == BIT_FIELD_REF);
> -
> -  op0 = TREE_OPERAND (op, 0);
> -  if (TREE_CODE (op0) != SSA_NAME
> -      || TREE_CODE (TREE_TYPE (op0)) != VECTOR_TYPE)
> -    return false;
> -
> -  def_stmt = get_prop_source_stmt (op0, false, NULL);
> -  if (!def_stmt || !can_propagate_from (def_stmt))
> -    return false;
> -
> -  op1 = TREE_OPERAND (op, 1);
> -  code = gimple_assign_rhs_code (def_stmt);
> -  elem_type = TREE_TYPE (TREE_TYPE (op0));
> -  type = TREE_TYPE (op);
> -  /* Also handle vector type.
> -     .i.e.
> -     _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
> -     _11 = BIT_FIELD_REF <_7, 64, 0>;
> -
> -     to
> -
> -     _11 = BIT_FIELD_REF <_1, 64, 64>.  */
> -
> -  size = tree_to_poly_uint64 (TYPE_SIZE (type));
> -  if (maybe_ne (bit_field_size (op), size))
> -    return false;
> -
> -  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
> -  if (code != VEC_PERM_EXPR
> -      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
> -    return false;
> -
> -  m = gimple_assign_rhs3 (def_stmt);
> -  if (TREE_CODE (m) != VECTOR_CST
> -      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> -    return false;
> -
> -  /* One element.  */
> -  if (known_eq (size, elem_size))
> -    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx)) % (2 * nelts);
> -  else
> -    {
> -      unsigned HOST_WIDE_INT nelts_op;
> -      if (!constant_multiple_p (size, elem_size, &nelts_op)
> -          || !pow2p_hwi (nelts_op))
> -        return false;
> -      /* Clamp vec_perm_expr index.  */
> -      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx)) % (2 * nelts);
> -      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1))
> -                     % (2 * nelts);
> -      /* Be in the same vector.  */
> -      if ((start < nelts) != (end < nelts))
> -        return false;
> -      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> -        {
> -          /* Continuous area.  */
> -          if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) % (2 * nelts) - 1
> -              != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1))
> -                 % (2 * nelts))
> -            return false;
> -        }
> -      /* Alignment not worse than before.  */
> -      if (start % nelts_op)
> -        return false;
> -      idx = start;
> -    }
> -
> -  if (idx < nelts)
> -    p = gimple_assign_rhs1 (def_stmt);
> -  else
> -    {
> -      p = gimple_assign_rhs2 (def_stmt);
> -      idx -= nelts;
> -    }
> -
> -  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> -                p, op1, bitsize_int (idx * elem_size));
> -  gimple_assign_set_rhs1 (stmt, tem);
> -  fold_stmt (gsi);
> -  update_stmt (gsi_stmt (*gsi));
> -  return true;
> -}
> -
>  /* Determine whether applying the 2 permutations (mask1 then mask2)
>     gives back one of the input.  */
>
> @@ -4677,8 +4578,6 @@ pass_forwprop::execute (function *fun)
>                     cfg_changed = true;
>                   changed = did_something != 0;
>                 }
> -             else if (code == BIT_FIELD_REF)
> -               changed = simplify_bitfield_ref (&gsi);
>               else if (code == CONSTRUCTOR
>                        && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
>                 changed = simplify_vector_constructor (&gsi);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index eea0b89db69..07b19a22d42 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7086,6 +7086,11 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
>        rhs = make_ssa_name (scalar_dest, stmt);
>        gimple_assign_set_lhs (stmt, rhs);
>        gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
> +      /* Fold the vector extract, combining it with a previous reversal
> +         like seen in PR90579.  */
> +      auto gsi2 = gsi_for_stmt (stmt);
> +      if (fold_stmt (&gsi2, follow_all_ssa_edges))
> +        update_stmt (gsi_stmt (gsi2));
>
>        stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
>        tree new_name = make_ssa_name (scalar_dest, stmt);
> --
> 2.43.0
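For reference, the end state this is after on the testcase: folding the
extract at emission time bypasses the reversal permute (which forwprop's
simple-dce then removes), the vector load of r only feeds scalar
extracts and gets decomposed, and the reduction tail behaves roughly
like the plain scalar sum below (illustration only, not compiler
output), which is what the vaddsd scan patterns in the testcase check.

  extern double r[6];

  /* Roughly what the final fold-left reduction boils down to once the
     permute is gone and the vector load has been split.  */
  double
  final_sum (void)
  {
    return r[5] + r[4] + r[3] + r[2] + r[1] + r[0];
  }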