For the testcase in question, which uses a fold-left vectorized
reduction of a reverse-iterating loop, we'd need two forwprop
invocations: one to first bypass the permute emitted for the
reverse-iterating loop and another to then decompose the vector load
that only feeds element extracts.  The following moves the first
transform to a match.pd pattern and makes sure we fold the element
extracts when the vectorizer emits them, so the single forwprop pass
can then pick up the vector load decomposition, avoiding the
forwarding failure this otherwise causes.

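To illustrate with a sketch that is not part of the patch (the SSA
names and the V4DF element count below are made up for the example),
the new match.pd pattern rewrites an extract from the reverse permute,

  vect_r = VEC_PERM_EXPR <vect_a, vect_a, { 3, 2, 1, 0 }>;
  _1 = BIT_FIELD_REF <vect_r, 64, 0>;

into an extract from the unpermuted vector,

  _1 = BIT_FIELD_REF <vect_a, 64, 192>;

so when vect_a is a load that only feeds such extracts the single
forwprop pass can decompose it into scalar loads.
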
Moving simplify_bitfield_ref also makes forwprop remove the dead
VEC_PERM_EXPR via the simple-dce it uses - this was also previously
missing.

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

	PR tree-optimization/90579
	* tree-ssa-forwprop.cc (simplify_bitfield_ref): Move to match.pd.
	(pass_forwprop::execute): Adjust.
	* match.pd (bit_field_ref (vec_perm ...)): New pattern modeled
	after simplify_bitfield_ref.
	* tree-vect-loop.cc (vect_expand_fold_left): Fold the element
	extract stmt, combining it with the vector def.

	* gcc.target/i386/pr90579.c: New testcase.
---
 gcc/match.pd                            |  56 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90579.c |  23 ++++++
 gcc/tree-ssa-forwprop.cc                | 103 +----------------------
 gcc/tree-vect-loop.cc                   |   5 ++
 4 files changed, 85 insertions(+), 102 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90579.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 20b2aec6f37..ea44201f2eb 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9538,6 +9538,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / const_k)->value; }
 		  @1 { bitsize_int ((idx % const_k) * width); })))))))))
 
+(simplify
+ (BIT_FIELD_REF (vec_perm@0 @1 @2 VECTOR_CST@3) @rsize @rpos)
+ (with
+  {
+    tree elem_type = TREE_TYPE (TREE_TYPE (@0));
+    poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
+    poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
+    unsigned HOST_WIDE_INT nelts, idx;
+    unsigned HOST_WIDE_INT nelts_op = 0;
+  }
+  (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), elem_size, &idx)
+       && VECTOR_CST_NELTS (@3).is_constant (&nelts)
+       && (known_eq (size, elem_size)
+           || (constant_multiple_p (size, elem_size, &nelts_op)
+               && pow2p_hwi (nelts_op))))
+   (with
+    {
+      bool ok = true;
+      /* One element.  */
+      if (known_eq (size, elem_size))
+        idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (@3, idx)) % (2 * nelts);
+      else
+        {
+          /* Clamp vec_perm_expr index.  */
+          unsigned start
+            = TREE_INT_CST_LOW (vector_cst_elt (@3, idx)) % (2 * nelts);
+          unsigned end
+            = (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + nelts_op - 1))
+               % (2 * nelts));
+          /* Be in the same vector.  */
+          if ((start < nelts) != (end < nelts))
+            ok = false;
+          else
+            for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
+              {
+                /* Continuous area.  */
+                if ((TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i))
+                     % (2 * nelts) - 1)
+                    != (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i - 1))
+                        % (2 * nelts)))
+                  {
+                    ok = false;
+                    break;
+                  }
+              }
+          /* Alignment not worse than before.  */
+          if (start % nelts_op)
+            ok = false;
+          idx = start;
+        }
+    }
+    (if (ok)
+     (if (idx < nelts)
+      (BIT_FIELD_REF @1 @rsize { bitsize_int (idx * elem_size); })
+      (BIT_FIELD_REF @2 @rsize { bitsize_int ((idx - nelts) * elem_size); })))))))
+
 /* Simplify a bit extraction from a bit insertion for the cases with
    the inserted element fully covering the extraction or the insertion
    not touching the extraction.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
new file mode 100644
index 00000000000..ab48a44063c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90579.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mfpmath=sse" } */
+
+extern double r[6];
+extern double a[];
+
+double
+loop (int k, double x)
+{
+  int i;
+  double t=0;
+  for (i=0;i<6;i++)
+    r[i] = x * a[i + k];
+  for (i=0;i<6;i++)
+    t+=r[5-i];
+  return t;
+}
+
+/* Verify we end up with scalar loads from r for the final sum.  */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+40" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+32" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+24" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+16" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9474682152a..fafc4d6b77a 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
 typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
 
 static bool forward_propagate_addr_expr (tree, tree, bool);
+static void optimize_vector_load (gimple_stmt_iterator *);
 
 /* Set to true if we delete dead edges during the optimization.  */
 static bool cfg_changed;
@@ -2481,106 +2482,6 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
 }
 
 
-/* Combine an element access with a shuffle.  Returns true if there were
-   any changes made, else it returns false.  */
-
-static bool
-simplify_bitfield_ref (gimple_stmt_iterator *gsi)
-{
-  gimple *stmt = gsi_stmt (*gsi);
-  gimple *def_stmt;
-  tree op, op0, op1;
-  tree elem_type, type;
-  tree p, m, tem;
-  unsigned HOST_WIDE_INT nelts, idx;
-  poly_uint64 size, elem_size;
-  enum tree_code code;
-
-  op = gimple_assign_rhs1 (stmt);
-  gcc_checking_assert (TREE_CODE (op) == BIT_FIELD_REF);
-
-  op0 = TREE_OPERAND (op, 0);
-  if (TREE_CODE (op0) != SSA_NAME
-      || TREE_CODE (TREE_TYPE (op0)) != VECTOR_TYPE)
-    return false;
-
-  def_stmt = get_prop_source_stmt (op0, false, NULL);
-  if (!def_stmt || !can_propagate_from (def_stmt))
-    return false;
-
-  op1 = TREE_OPERAND (op, 1);
-  code = gimple_assign_rhs_code (def_stmt);
-  elem_type = TREE_TYPE (TREE_TYPE (op0));
-  type = TREE_TYPE (op);
-  /* Also handle vector type.
-     .i.e.
-     _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
-     _11 = BIT_FIELD_REF <_7, 64, 0>;
-
-     to
-
-     _11 = BIT_FIELD_REF <_1, 64, 64>.  */
-
-  size = tree_to_poly_uint64 (TYPE_SIZE (type));
-  if (maybe_ne (bit_field_size (op), size))
-    return false;
-
-  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
-  if (code != VEC_PERM_EXPR
-      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
-    return false;
-
-  m = gimple_assign_rhs3 (def_stmt);
-  if (TREE_CODE (m) != VECTOR_CST
-      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
-    return false;
-
-  /* One element.  */
-  if (known_eq (size, elem_size))
-    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx)) % (2 * nelts);
-  else
-    {
-      unsigned HOST_WIDE_INT nelts_op;
-      if (!constant_multiple_p (size, elem_size, &nelts_op)
-          || !pow2p_hwi (nelts_op))
-        return false;
-      /* Clamp vec_perm_expr index.  */
-      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx)) % (2 * nelts);
-      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1))
-                     % (2 * nelts);
-      /* Be in the same vector.  */
-      if ((start < nelts) != (end < nelts))
-        return false;
-      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
-        {
-          /* Continuous area.  */
-          if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) % (2 * nelts) - 1
-              != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1))
-                 % (2 * nelts))
-            return false;
-        }
-      /* Alignment not worse than before.  */
-      if (start % nelts_op)
-        return false;
-      idx = start;
-    }
-
-  if (idx < nelts)
-    p = gimple_assign_rhs1 (def_stmt);
-  else
-    {
-      p = gimple_assign_rhs2 (def_stmt);
-      idx -= nelts;
-    }
-
-  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
-                p, op1, bitsize_int (idx * elem_size));
-  gimple_assign_set_rhs1 (stmt, tem);
-  fold_stmt (gsi);
-  update_stmt (gsi_stmt (*gsi));
-  return true;
-}
-
 /* Determine whether applying the 2 permutations (mask1 then mask2)
    gives back one of the input.  */
 
@@ -4677,8 +4578,6 @@ pass_forwprop::execute (function *fun)
                       cfg_changed = true;
                     changed = did_something != 0;
                   }
-                else if (code == BIT_FIELD_REF)
-                  changed = simplify_bitfield_ref (&gsi);
                 else if (code == CONSTRUCTOR
                          && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
                   changed = simplify_vector_constructor (&gsi);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index eea0b89db69..07b19a22d42 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7086,6 +7086,11 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
       rhs = make_ssa_name (scalar_dest, stmt);
       gimple_assign_set_lhs (stmt, rhs);
       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+      /* Fold the vector extract, combining it with a previous reversal
+         like seen in PR90579.  */
+      auto gsi2 = gsi_for_stmt (stmt);
+      if (fold_stmt (&gsi2, follow_all_ssa_edges))
+        update_stmt (gsi_stmt (gsi2));
 
       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
       tree new_name = make_ssa_name (scalar_dest, stmt);
-- 
2.43.0