For the new test case, we end up using an intermediate value of the reduction chain as the reduction result. This can be easily supported by generating epilogues for (possibly multiple) intermediate values.
For this to work, the following patch relaxes cycle detection to allow out-of-loop uses, and it makes sure vectorizable_live_operation picks it up as a reduction. Finally, some invariants that process_use checks are no longer true. Nicely simple ;) Bootstrap & regtest running on x86_64-unknown-linux-gnu. Richard. 2019-10-28 Richard Biener <rguent...@suse.de> PR tree-optimization/65930 * tree-vect-loop.c (check_reduction_path): Relax single-use check allowing out-of-loop uses. (vect_is_simple_reduction): SLP reduction chains cannot have intermediate stmts used outside of the loop. (vect_create_epilog_for_reduction): The adjustment might need to be converted. (vectorizable_reduction): Annotate live stmts of the reduction chain with STMT_VINFO_REDUC_DEF. * tree-vect-stmts.c (process_use): Remove no longer true asserts. * gcc.dg/vect/pr65930-1.c: New testcase. Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 277517) +++ gcc/tree-vect-loop.c (working copy) @@ -2678,8 +2678,7 @@ pop: { gimple *use_stmt = USE_STMT (path[i].second); tree op = USE_FROM_PTR (path[i].second); - if (! has_single_use (op) - || ! is_gimple_assign (use_stmt) + if (! is_gimple_assign (use_stmt) /* The following make sure we can compute the operand index easily plus it mostly disallows chaining via COND_EXPR condition operands. */ @@ -2690,6 +2689,20 @@ pop: fail = true; break; } + /* Check there's only a single stmt the op is used on inside + of the loop. 
*/ + imm_use_iterator imm_iter; + gimple *op_use_stmt; + unsigned cnt = 0; + FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) + if (!is_gimple_debug (op_use_stmt) + && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) + cnt++; + if (cnt != 1) + { + fail = true; + break; + } enum tree_code use_code = gimple_assign_rhs_code (use_stmt); if (use_code == MINUS_EXPR) { @@ -2922,7 +2935,10 @@ vect_is_simple_reduction (loop_vec_info for (i = path.length () - 1; i >= 1; --i) { gimple *stmt = USE_STMT (path[i].second); - if (gimple_assign_rhs_code (stmt) != code) + if (gimple_assign_rhs_code (stmt) != code + /* We can only handle the final value in epilogue + generation for reduction chains. */ + || (i != 1 && !has_single_use (gimple_assign_lhs (stmt)))) is_slp_reduc = false; stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); STMT_VINFO_REDUC_IDX (stmt_info) @@ -4119,11 +4135,11 @@ vect_create_epilog_for_reduction (stmt_v stmt_vec_info phi_info; gimple_stmt_iterator exit_gsi; tree vec_dest; - tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; + tree new_temp = NULL_TREE, new_name, new_scalar_dest; gimple *epilog_stmt = NULL; gimple *exit_phi; tree bitsize; - tree expr, def; + tree def; tree orig_name, scalar_result; imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; @@ -5048,25 +5064,26 @@ vect_create_epilog_for_reduction (stmt_v if (adjustment_def) { gcc_assert (!slp_reduc); + gimple_seq stmts = NULL; if (nested_in_vect_loop) { new_phi = new_phis[0]; - gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); - expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); - new_dest = vect_create_destination_var (scalar_dest, vectype); + gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); + adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); + new_temp = gimple_build (&stmts, code, vectype, + PHI_RESULT (new_phi), adjustment_def); } else { new_temp = scalar_results[0]; gcc_assert (TREE_CODE 
(TREE_TYPE (adjustment_def)) != VECTOR_TYPE); - expr = build2 (code, scalar_type, new_temp, adjustment_def); - new_dest = vect_create_destination_var (scalar_dest, scalar_type); + adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def); + new_temp = gimple_build (&stmts, code, scalar_type, + new_temp, adjustment_def); } - epilog_stmt = gimple_build_assign (new_dest, expr); - new_temp = make_ssa_name (new_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + epilog_stmt = gimple_seq_last_stmt (stmts); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); if (nested_in_vect_loop) { stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); @@ -5742,6 +5759,10 @@ vectorizable_reduction (stmt_vec_info st } if (!REDUC_GROUP_FIRST_ELEMENT (def)) only_slp_reduc_chain = false; + /* ??? For epilogue generation live members of the chain need + to point back to the PHI for info_for_reduction to work. */ + if (STMT_VINFO_LIVE_P (def)) + STMT_VINFO_REDUC_DEF (def) = phi_info; reduc_def = gimple_op (def->stmt, 1 + STMT_VINFO_REDUC_IDX (def)); reduc_chain_length++; } Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 277517) +++ gcc/tree-vect-stmts.c (working copy) @@ -457,7 +457,6 @@ process_use (stmt_vec_info stmt_vinfo, t bool force) { stmt_vec_info dstmt_vinfo; - basic_block bb, def_bb; enum vect_def_type dt; /* case 1: we are only interested in uses that need to be vectorized. Uses @@ -473,28 +472,8 @@ process_use (stmt_vec_info stmt_vinfo, t if (!dstmt_vinfo) return opt_result::success (); - def_bb = gimple_bb (dstmt_vinfo->stmt); - - /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO). 
- DSTMT_VINFO must have already been processed, because this should be the - only way that STMT, which is a reduction-phi, was put in the worklist, - as there should be no other uses for DSTMT_VINFO in the loop. So we just - check that everything is as expected, and we are done. */ - bb = gimple_bb (stmt_vinfo->stmt); - if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI - && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def - && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI - && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def - && bb->loop_father == def_bb->loop_father) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "reduc-stmt defining reduc-phi in the same nest.\n"); - gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction); - gcc_assert (STMT_VINFO_LIVE_P (dstmt_vinfo) - || STMT_VINFO_RELEVANT (dstmt_vinfo) > vect_unused_in_scope); - return opt_result::success (); - } + basic_block def_bb = gimple_bb (dstmt_vinfo->stmt); + basic_block bb = gimple_bb (stmt_vinfo->stmt); /* case 3a: outer-loop stmt defining an inner-loop stmt: outer-loop-header-bb: Index: gcc/testsuite/gcc.dg/vect/pr65930-1.c =================================================================== --- gcc/testsuite/gcc.dg/vect/pr65930-1.c (nonexistent) +++ gcc/testsuite/gcc.dg/vect/pr65930-1.c (working copy) @@ -0,0 +1,26 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +unsigned __attribute__((noipa)) +bar (unsigned int *x) +{ + int sum = 4; + x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); + for (int i = 0; i < 16; ++i) + sum += x[i]; + return sum; +} + +int +main() +{ + static int a[16] __attribute__((aligned(__BIGGEST_ALIGNMENT__))) + = { 1, 3, 5, 8, 9, 10, 17, 18, 23, 29, 30, 55, 42, 2, 3, 1 }; + check_vect (); + if (bar (a) != 260) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */