I noticed vect_analyze_slp didn't try SLP reduction when it detected any reduction chain. That's because the LOOP_VINFO_REDUCTIONS array also contains the detected chains -- but a reduction chain can only be vectorized as a reduction chain (well, I'm going to fix that! I just ran into this code in the process).
The following rectifies this by properly not putting reduction chains onto LOOP_VINFO_REDUCTIONS, simplifying vect_analyze_slp thereby. The testcase is now vectorized with full SLP; for v4si that's an unroll factor of only 2 compared to previously where we used interleaving with unroll factor 4 and two reductions to process the remaining SLP reduction (and used SLP for the reduction chain). Bootstrap and regtest running on x86_64-unknown-linux-gnu. Richard. 2017-06-29 Richard Biener <rguent...@suse.de> * tree-vect-loop.c (vect_analyze_scalar_cycles_1): Do not add reduction chains to LOOP_VINFO_REDUCTIONS. * tree-vect-slp.c (vect_analyze_slp): Continue looking for SLP reductions after processing reduction chains. * gcc.dg/vect/slp-reduc-8.c: New testcase. Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 249729) +++ gcc/tree-vect-loop.c (working copy) @@ -890,8 +895,10 @@ vect_analyze_scalar_cycles_1 (loop_vec_i STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = vect_reduction_def; /* Store the reduction cycles for possible vectorization in - loop-aware SLP. */ - LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt); + loop-aware SLP if it was not detected as reduction + chain. */ + if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt))) + LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt); } } } Index: gcc/tree-vect-slp.c =================================================================== --- gcc/tree-vect-slp.c (revision 249729) +++ gcc/tree-vect-slp.c (working copy) @@ -2102,15 +2103,13 @@ vect_analyze_slp (vec_info *vinfo, unsig { unsigned int i; gimple *first_element; - bool ok = false; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n"); /* Find SLP sequences starting from groups of grouped stores. 
*/ FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) - if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size)) - ok = true; + vect_analyze_slp_instance (vinfo, first_element, max_tree_size); if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) { @@ -2118,22 +2117,15 @@ vect_analyze_slp (vec_info *vinfo, unsig { /* Find SLP sequences starting from reduction chains. */ FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) - if (vect_analyze_slp_instance (vinfo, first_element, + if (! vect_analyze_slp_instance (vinfo, first_element, max_tree_size)) - ok = true; - else - return false; - - /* Don't try to vectorize SLP reductions if reduction chain was - detected. */ - return ok; + return false; } /* Find SLP sequences starting from groups of reductions. */ - if (loop_vinfo->reductions.length () > 1 - && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0], - max_tree_size)) - ok = true; + if (loop_vinfo->reductions.length () > 1) + vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0], + max_tree_size); } return true; Index: gcc/testsuite/gcc.dg/vect/slp-reduc-8.c =================================================================== --- gcc/testsuite/gcc.dg/vect/slp-reduc-8.c (nonexistent) +++ gcc/testsuite/gcc.dg/vect/slp-reduc-8.c (working copy) @@ -0,0 +1,48 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +static int a[512], b[512]; + +void __attribute__((noinline,noclone)) +foo (int *sum1p, int *sum2p, int *sum3p) +{ + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + /* Check that we vectorize a reduction chain and a SLP reduction + at the same time. 
*/ + for (int i = 0; i < 256; ++i) + { + sum1 += a[2*i]; + sum1 += a[2*i + 1]; + sum2 += b[2*i]; + sum3 += b[2*i + 1]; + } + *sum1p = sum1; + *sum2p = sum2; + *sum3p = sum3; +} + +int main() +{ + check_vect (); + + for (int i = 0; i < 256; ++i) + { + a[2*i] = i; + a[2*i + 1] = i/2; + b[2*i] = i + 1; + b[2*i + 1] = i/2 + 1; + __asm__ volatile ("" : : : "memory"); + } + int sum1, sum2, sum3; + foo (&sum1, &sum2, &sum3); + if (sum1 != 48896 || sum2 != 32896 || sum3 != 16512) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */