On June 9, 2017 5:32:10 PM GMT+02:00, Christophe Lyon <christophe.l...@linaro.org> wrote: >On 8 June 2017 at 15:49, Richard Biener <rguent...@suse.de> wrote: >> On Thu, 8 Jun 2017, Richard Biener wrote: >> >>> >>> The following fixes unsafe vectorization of reductions in outer loop >>> vectorization. It combines it with a bit of cleanup in the affected >>> function. >>> >>> Bootstrap and regtest running on x86_64-unknown-linux-gnu. >> >> Re-testing the following variant -- multiple loop-closed PHI nodes >> for the same SSA name happen. >> >> Richard. >> >> 2017-06-08 Richard Biener <rguent...@suse.de> >> >> PR tree-optimization/66623 >> * tree-vect-loop.c (vect_is_simple_reduction): Cleanup, >> refactor check_reduction into two parts, properly computing >> whether we have to check reduction validity for outer loop >> vectorization. >> >> * gcc.dg/vect/pr66623.c: New testcase. >> > >Hi Richard, > >The new testcase fails on armeb* targets: >FAIL: gcc.dg/vect/pr66623.c -flto -ffat-lto-objects execution test >FAIL: gcc.dg/vect/pr66623.c execution test
Ah, the xfail probably isn't enough. Can you try a dg-skip or so? >Christophe > > >> Index: gcc/tree-vect-loop.c >> =================================================================== >> --- gcc/tree-vect-loop.c (revision 249007) >> +++ gcc/tree-vect-loop.c (working copy) >> @@ -2727,8 +2727,6 @@ vect_is_simple_reduction (loop_vec_info >> { >> struct loop *loop = (gimple_bb (phi))->loop_father; >> struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); >> - edge latch_e = loop_latch_edge (loop); >> - tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); >> gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = >NULL; >> enum tree_code orig_code, code; >> tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; >> @@ -2742,11 +2740,6 @@ vect_is_simple_reduction (loop_vec_info >> *double_reduc = false; >> *v_reduc_type = TREE_CODE_REDUCTION; >> >> - /* Check validity of the reduction only for the innermost loop. >*/ >> - bool check_reduction = ! flow_loop_nested_p (vect_loop, loop); >> - gcc_assert ((check_reduction && loop == vect_loop) >> - || (!check_reduction && flow_loop_nested_p (vect_loop, >loop))); >> - >> name = PHI_RESULT (phi); >> /* ??? If there are no uses of the PHI result the inner loop >reduction >> won't be detected as possibly double-reduction by >vectorizable_reduction >> @@ -2775,13 +2768,15 @@ vect_is_simple_reduction (loop_vec_info >> { >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> - "reduction used in loop.\n"); >> + "reduction value used in loop.\n"); >> return NULL; >> } >> >> phi_use_stmt = use_stmt; >> } >> >> + edge latch_e = loop_latch_edge (loop); >> + tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); >> if (TREE_CODE (loop_arg) != SSA_NAME) >> { >> if (dump_enabled_p ()) >> @@ -2795,18 +2790,22 @@ vect_is_simple_reduction (loop_vec_info >> } >> >> def_stmt = SSA_NAME_DEF_STMT (loop_arg); >> - if (!def_stmt) >> + if (gimple_nop_p (def_stmt)) >> { >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> - "reduction: no def_stmt.\n"); >> + "reduction: no def_stmt\n"); >> return NULL; >> } >> >> if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != >GIMPLE_PHI) >> { >> if (dump_enabled_p ()) >> - dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0); >> + { >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> + "reduction: unhandled reduction operation: >"); >> + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, >def_stmt, 0); >> + } >> return NULL; >> } >> >> @@ -2822,6 +2821,7 @@ vect_is_simple_reduction (loop_vec_info >> } >> >> nloop_uses = 0; >> + auto_vec<gphi *, 3> lcphis; >> FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) >> { >> gimple *use_stmt = USE_STMT (use_p); >> @@ -2829,6 +2829,9 @@ vect_is_simple_reduction (loop_vec_info >> continue; >> if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) >> nloop_uses++; >> + else >> + /* We can have more than one loop-closed PHI. */ >> + lcphis.safe_push (as_a <gphi *> (use_stmt)); >> if (nloop_uses > 1) >> { >> if (dump_enabled_p ()) >> @@ -2873,6 +2876,27 @@ vect_is_simple_reduction (loop_vec_info >> return NULL; >> } >> >> + /* If we are vectorizing an inner reduction we are executing that >> + in the original order only in case we are not dealing with a >> + double reduction. */ >> + bool check_reduction = true; >> + if (flow_loop_nested_p (vect_loop, loop)) >> + { >> + gphi *lcphi; >> + unsigned i; >> + check_reduction = false; >> + FOR_EACH_VEC_ELT (lcphis, i, lcphi) >> + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result >(lcphi)) >> + { >> + gimple *use_stmt = USE_STMT (use_p); >> + if (is_gimple_debug (use_stmt)) >> + continue; >> + if (! flow_bb_inside_loop_p (vect_loop, gimple_bb >(use_stmt))) >> + check_reduction = true; >> + } >> + } >> + >> + bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); >> code = orig_code = gimple_assign_rhs_code (def_stmt); >> >> /* We can handle "res -= x[i]", which is non-associative by >> @@ -2887,27 +2911,8 @@ vect_is_simple_reduction (loop_vec_info >> >> if (code == COND_EXPR) >> { >> - if (check_reduction) >> + if (! nested_in_vect_loop) >> *v_reduc_type = COND_REDUCTION; >> - } >> - else if (!commutative_tree_code (code) || !associative_tree_code >(code)) >> - { >> - if (dump_enabled_p ()) >> - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> - "reduction: not commutative/associative: "); >> - return NULL; >> - } >> - >> - if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS) >> - { >> - if (code != COND_EXPR) >> - { >> - if (dump_enabled_p ()) >> - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> - "reduction: not binary operation: "); >> - >> - return NULL; >> - } >> >> op3 = gimple_assign_rhs1 (def_stmt); >> if (COMPARISON_CLASS_P (op3)) >> @@ -2918,30 +2923,35 @@ vect_is_simple_reduction (loop_vec_info >> >> op1 = gimple_assign_rhs2 (def_stmt); >> op2 = gimple_assign_rhs3 (def_stmt); >> - >> - if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != >SSA_NAME) >> - { >> - if (dump_enabled_p ()) >> - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> - "reduction: uses not ssa_names: "); >> - >> - return NULL; >> - } >> } >> - else >> + else if (!commutative_tree_code (code) || !associative_tree_code >(code)) >> + { >> + if (dump_enabled_p ()) >> + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> + "reduction: not commutative/associative: "); >> + return NULL; >> + } >> + else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) >> { >> op1 = gimple_assign_rhs1 (def_stmt); >> op2 = gimple_assign_rhs2 (def_stmt); >> + } >> + else >> + { >> + if (dump_enabled_p ()) >> + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> + "reduction: not handled operation: "); >> + return NULL; >> + } >> >> - if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != >SSA_NAME) >> - { >> - if (dump_enabled_p ()) >> - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> - "reduction: uses not ssa_names: "); >> + if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) >> + { >> + if (dump_enabled_p ()) >> + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, >> + "reduction: both uses not ssa_names: "); >> >> - return NULL; >> - } >> - } >> + return NULL; >> + } >> >> type = TREE_TYPE (gimple_assign_lhs (def_stmt)); >> if ((TREE_CODE (op1) == SSA_NAME >> @@ -3091,7 +3101,7 @@ vect_is_simple_reduction (loop_vec_info >> == vect_internal_def >> && !is_loop_header_bb_p (gimple_bb (def2))))))) >> { >> - if (check_reduction && orig_code != MINUS_EXPR) >> + if (! nested_in_vect_loop && orig_code != MINUS_EXPR) >> { >> /* Check if we can swap operands (just for simplicity - so >that >> the rest of the code can assume that the reduction >variable >> @@ -3145,7 +3155,8 @@ vect_is_simple_reduction (loop_vec_info >> } >> >> /* Try to find SLP reduction chain. */ >> - if (check_reduction && code != COND_EXPR >> + if (! nested_in_vect_loop >> + && code != COND_EXPR >> && vect_is_slp_reduction (loop_info, phi, def_stmt)) >> { >> if (dump_enabled_p ()) >> Index: gcc/testsuite/gcc.dg/vect/pr66623.c >> =================================================================== >> --- gcc/testsuite/gcc.dg/vect/pr66623.c (revision 0) >> +++ gcc/testsuite/gcc.dg/vect/pr66623.c (working copy) >> @@ -0,0 +1,86 @@ >> +/* { dg-require-effective-target vect_float } */ >> + >> +#include "tree-vect.h" >> + >> +extern void abort (void); >> + >> +#define OP * >> +#define INIT 1.0 >> + >> +float __attribute__((noinline,noclone)) >> +foo (float *__restrict__ i) >> +{ >> + float l = INIT; >> + int a; >> + int b; >> + >> + for (a = 0; a < 4; a++) >> + for (b = 0; b < 4; b++) >> + l = l OP i[b]; >> + >> + return l; >> +} >> + >> +float __attribute__((noinline,noclone)) >> +foo_ref (float *__restrict__ i) >> +{ >> + float l = INIT; >> + >> + l = l OP i[0]; >> + l = l OP i[1]; >> + l = l OP i[2]; >> + l = l OP i[3]; >> + >> + l = l OP i[0]; >> + l = l OP i[1]; >> + l = l OP i[2]; >> + l = l OP i[3]; >> + >> + l = l OP i[0]; >> + l = l OP i[1]; >> + l = l OP i[2]; >> + l = l OP i[3]; >> + >> + l = l OP i[0]; >> + l = l OP i[1]; >> + l = l OP i[2]; >> + l = l OP i[3]; >> + >> + return l; >> +} >> + >> +union u >> +{ >> + float f; >> + unsigned int u; >> +}; >> + >> +int >> +main (void) >> +{ >> + union u res, res2; >> + float a[4]; >> + >> + if (sizeof (float) != sizeof (unsigned int)) >> + return 0; >> + >> + check_vect (); >> + >> + a[0] = 0.01; >> + a[1] = 0.01; >> + a[2] = 0.01; >> + a[3] = 1.0; >> + >> + res.f = foo_ref (a); >> + >> + res2.f = foo (a); >> + >> + if (res.u != res2.u) >> + abort (); >> + >> + return 0; >> +} >> + >> +/* need -ffast-math to vectorize this loop. */ >> +/* ARM NEON passes -ffast-math to these tests, so expect this to >fail. */ >> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { >xfail arm_neon_ok } } } */