On 8 June 2017 at 15:49, Richard Biener <rguent...@suse.de> wrote: > On Thu, 8 Jun 2017, Richard Biener wrote: > >> >> The following fixes unsafe vectorization of reductions in outer loop >> vectorization. It combines it with a bit of cleanup in the affected >> function. >> >> Bootstrap and regtest running on x86_64-unknown-linux-gnu. > > Re-testing the following variant -- multiple loop-closed PHI nodes > for the same SSA name happen. > > Richard. > > 2017-06-08 Richard Biener <rguent...@suse.de> > > PR tree-optimization/66623 > * tree-vect-loop.c (vect_is_simple_reduction): Cleanup, > refactor check_reduction into two parts, properly computing > whether we have to check reduction validity for outer loop > vectorization. > > * gcc.dg/vect/pr66623.c: New testcase. >
Hi Richard, The new testcase fails on armeb* targets: FAIL: gcc.dg/vect/pr66623.c -flto -ffat-lto-objects execution test FAIL: gcc.dg/vect/pr66623.c execution test Christophe > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c (revision 249007) > +++ gcc/tree-vect-loop.c (working copy) > @@ -2727,8 +2727,6 @@ vect_is_simple_reduction (loop_vec_info > { > struct loop *loop = (gimple_bb (phi))->loop_father; > struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); > - edge latch_e = loop_latch_edge (loop); > - tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); > gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL; > enum tree_code orig_code, code; > tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; > @@ -2742,11 +2740,6 @@ vect_is_simple_reduction (loop_vec_info > *double_reduc = false; > *v_reduc_type = TREE_CODE_REDUCTION; > > - /* Check validity of the reduction only for the innermost loop. */ > - bool check_reduction = ! flow_loop_nested_p (vect_loop, loop); > - gcc_assert ((check_reduction && loop == vect_loop) > - || (!check_reduction && flow_loop_nested_p (vect_loop, loop))); > - > name = PHI_RESULT (phi); > /* ??? If there are no uses of the PHI result the inner loop reduction > won't be detected as possibly double-reduction by vectorizable_reduction > @@ -2775,13 +2768,15 @@ vect_is_simple_reduction (loop_vec_info > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "reduction used in loop.\n"); > + "reduction value used in loop.\n"); > return NULL; > } > > phi_use_stmt = use_stmt; > } > > + edge latch_e = loop_latch_edge (loop); > + tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); > if (TREE_CODE (loop_arg) != SSA_NAME) > { > if (dump_enabled_p ()) > @@ -2795,18 +2790,22 @@ vect_is_simple_reduction (loop_vec_info > } > > def_stmt = SSA_NAME_DEF_STMT (loop_arg); > - if (!def_stmt) > + if (gimple_nop_p (def_stmt)) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "reduction: no def_stmt.\n"); > + "reduction: no def_stmt\n"); > return NULL; > } > > if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI) > { > if (dump_enabled_p ()) > - dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0); > + { > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "reduction: unhandled reduction operation: "); > + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0); > + } > return NULL; > } > > @@ -2822,6 +2821,7 @@ vect_is_simple_reduction (loop_vec_info > } > > nloop_uses = 0; > + auto_vec<gphi *, 3> lcphis; > FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) > { > gimple *use_stmt = USE_STMT (use_p); > @@ -2829,6 +2829,9 @@ vect_is_simple_reduction (loop_vec_info > continue; > if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) > nloop_uses++; > + else > + /* We can have more than one loop-closed PHI. */ > + lcphis.safe_push (as_a <gphi *> (use_stmt)); > if (nloop_uses > 1) > { > if (dump_enabled_p ()) > @@ -2873,6 +2876,27 @@ vect_is_simple_reduction (loop_vec_info > return NULL; > } > > + /* If we are vectorizing an inner reduction we are executing that > + in the original order only in case we are not dealing with a > + double reduction. */ > + bool check_reduction = true; > + if (flow_loop_nested_p (vect_loop, loop)) > + { > + gphi *lcphi; > + unsigned i; > + check_reduction = false; > + FOR_EACH_VEC_ELT (lcphis, i, lcphi) > + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) > + { > + gimple *use_stmt = USE_STMT (use_p); > + if (is_gimple_debug (use_stmt)) > + continue; > + if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) > + check_reduction = true; > + } > + } > + > + bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); > code = orig_code = gimple_assign_rhs_code (def_stmt); > > /* We can handle "res -= x[i]", which is non-associative by > @@ -2887,27 +2911,8 @@ vect_is_simple_reduction (loop_vec_info > > if (code == COND_EXPR) > { > - if (check_reduction) > + if (! nested_in_vect_loop) > *v_reduc_type = COND_REDUCTION; > - } > - else if (!commutative_tree_code (code) || !associative_tree_code (code)) > - { > - if (dump_enabled_p ()) > - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > - "reduction: not commutative/associative: "); > - return NULL; > - } > - > - if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS) > - { > - if (code != COND_EXPR) > - { > - if (dump_enabled_p ()) > - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > - "reduction: not binary operation: "); > - > - return NULL; > - } > > op3 = gimple_assign_rhs1 (def_stmt); > if (COMPARISON_CLASS_P (op3)) > @@ -2918,30 +2923,35 @@ vect_is_simple_reduction (loop_vec_info > > op1 = gimple_assign_rhs2 (def_stmt); > op2 = gimple_assign_rhs3 (def_stmt); > - > - if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) > - { > - if (dump_enabled_p ()) > - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > - "reduction: uses not ssa_names: "); > - > - return NULL; > - } > } > - else > + else if (!commutative_tree_code (code) || !associative_tree_code (code)) > + { > + if (dump_enabled_p ()) > + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > + "reduction: not commutative/associative: "); > + return NULL; > + } > + else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) > { > op1 = gimple_assign_rhs1 (def_stmt); > op2 = gimple_assign_rhs2 (def_stmt); > + } > + else > + { > + if (dump_enabled_p ()) > + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > + "reduction: not handled operation: "); > + return NULL; > + } > > - if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) > - { > - if (dump_enabled_p ()) > - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > - "reduction: uses not ssa_names: "); > + if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) > + { > + if (dump_enabled_p ()) > + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, > + "reduction: both uses not ssa_names: "); > > - return NULL; > - } > - } > + return NULL; > + } > > type = TREE_TYPE (gimple_assign_lhs (def_stmt)); > if ((TREE_CODE (op1) == SSA_NAME > @@ -3091,7 +3101,7 @@ vect_is_simple_reduction (loop_vec_info > == vect_internal_def > && !is_loop_header_bb_p (gimple_bb (def2))))))) > { > - if (check_reduction && orig_code != MINUS_EXPR) > + if (! nested_in_vect_loop && orig_code != MINUS_EXPR) > { > /* Check if we can swap operands (just for simplicity - so that > the rest of the code can assume that the reduction variable > @@ -3145,7 +3155,8 @@ vect_is_simple_reduction (loop_vec_info > } > > /* Try to find SLP reduction chain. */ > - if (check_reduction && code != COND_EXPR > + if (! nested_in_vect_loop > + && code != COND_EXPR > && vect_is_slp_reduction (loop_info, phi, def_stmt)) > { > if (dump_enabled_p ()) > Index: gcc/testsuite/gcc.dg/vect/pr66623.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/pr66623.c (revision 0) > +++ gcc/testsuite/gcc.dg/vect/pr66623.c (working copy) > @@ -0,0 +1,86 @@ > +/* { dg-require-effective-target vect_float } */ > + > +#include "tree-vect.h" > + > +extern void abort (void); > + > +#define OP * > +#define INIT 1.0 > + > +float __attribute__((noinline,noclone)) > +foo (float *__restrict__ i) > +{ > + float l = INIT; > + int a; > + int b; > + > + for (a = 0; a < 4; a++) > + for (b = 0; b < 4; b++) > + l = l OP i[b]; > + > + return l; > +} > + > +float __attribute__((noinline,noclone)) > +foo_ref (float *__restrict__ i) > +{ > + float l = INIT; > + > + l = l OP i[0]; > + l = l OP i[1]; > + l = l OP i[2]; > + l = l OP i[3]; > + > + l = l OP i[0]; > + l = l OP i[1]; > + l = l OP i[2]; > + l = l OP i[3]; > + > + l = l OP i[0]; > + l = l OP i[1]; > + l = l OP i[2]; > + l = l OP i[3]; > + > + l = l OP i[0]; > + l = l OP i[1]; > + l = l OP i[2]; > + l = l OP i[3]; > + > + return l; > +} > + > +union u > +{ > + float f; > + unsigned int u; > +}; > + > +int > +main (void) > +{ > + union u res, res2; > + float a[4]; > + > + if (sizeof (float) != sizeof (unsigned int)) > + return 0; > + > + check_vect (); > + > + a[0] = 0.01; > + a[1] = 0.01; > + a[2] = 0.01; > + a[3] = 1.0; > + > + res.f = foo_ref (a); > + > + res2.f = foo (a); > + > + if (res.u != res2.u) > + abort (); > + > + return 0; > +} > + > +/* need -ffast-math to vectorize this loop. */ > +/* ARM NEON passes -ffast-math to these tests, so expect this to fail. */ > +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { xfail > arm_neon_ok } } } */