Currently we only handle strided SLP loads with group_size <= nunits && nunits % group_size == 0. That's overly restrictive, as we can chunk loads with group_size > nunits && group_size % nunits == 0, and handle all other cases by constructing the vector from scalars (as we'd do for non-SLP).
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2016-06-08 Richard Biener <rguent...@suse.de> * tree-vect-stmts.c (vectorizable_load): Remove restrictions on strided SLP loads and fall back to scalar loads in case we can't chunk them. * gcc.dg/vect/slp-43.c: New testcase. Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c (revision 237205) --- gcc/tree-vect-stmts.c (working copy) *************** vectorizable_load (gimple *stmt, gimple_ *** 6440,6456 **** } } else if (STMT_VINFO_STRIDED_P (stmt_info)) ! { ! if (grouped_load ! && slp ! && (group_size > nunits ! || nunits % group_size != 0)) ! { ! dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ! "unhandled strided group load\n"); ! return false; ! } ! } else { negative = tree_int_cst_compare (nested_in_vect_loop --- 6440,6446 ---- } } else if (STMT_VINFO_STRIDED_P (stmt_info)) ! ; else { negative = tree_int_cst_compare (nested_in_vect_loop *************** vectorizable_load (gimple *stmt, gimple_ *** 6744,6759 **** running_off = offvar; alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0); int nloads = nunits; tree ltype = TREE_TYPE (vectype); auto_vec<tree> dr_chain; if (slp) { ! nloads = nunits / group_size; ! if (group_size < nunits) ! ltype = build_vector_type (TREE_TYPE (vectype), group_size); ! else ! ltype = vectype; ! ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype))); /* For SLP permutation support we need to load the whole group, not only the number of vector stmts the permutation result fits in. */ --- 6734,6762 ---- running_off = offvar; alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0); int nloads = nunits; + int lnel = 1; tree ltype = TREE_TYPE (vectype); auto_vec<tree> dr_chain; if (slp) { ! if (group_size < nunits ! && nunits % group_size == 0) ! { ! nloads = nunits / group_size; ! lnel = group_size; ! 
ltype = build_vector_type (TREE_TYPE (vectype), group_size); ! ltype = build_aligned_type (ltype, ! TYPE_ALIGN (TREE_TYPE (vectype))); ! } ! else if (group_size >= nunits ! && group_size % nunits == 0) ! { ! nloads = 1; ! lnel = nunits; ! ltype = vectype; ! ltype = build_aligned_type (ltype, ! TYPE_ALIGN (TREE_TYPE (vectype))); ! } /* For SLP permutation support we need to load the whole group, not only the number of vector stmts the permutation result fits in. */ *************** vectorizable_load (gimple *stmt, gimple_ *** 6765,6812 **** else ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } for (j = 0; j < ncopies; j++) { - tree vec_inv; - if (nloads > 1) { ! vec_alloc (v, nloads); ! for (i = 0; i < nloads; i++) { ! tree newref, newoff; ! gimple *incr; ! newref = build2 (MEM_REF, ltype, running_off, alias_off); ! ! newref = force_gimple_operand_gsi (gsi, newref, true, ! NULL_TREE, true, ! GSI_SAME_STMT); ! CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref); ! newoff = copy_ssa_name (running_off); ! incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, ! running_off, stride_step); vect_finish_stmt_generation (stmt, incr, gsi); running_off = newoff; } - - vec_inv = build_constructor (vectype, v); - new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi); - new_stmt = SSA_NAME_DEF_STMT (new_temp); } ! else { ! new_stmt = gimple_build_assign (make_ssa_name (ltype), ! build2 (MEM_REF, ltype, ! running_off, alias_off)); ! vect_finish_stmt_generation (stmt, new_stmt, gsi); ! ! tree newoff = copy_ssa_name (running_off); ! gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, ! running_off, stride_step); ! vect_finish_stmt_generation (stmt, incr, gsi); ! ! 
running_off = newoff; } if (slp) --- 6768,6810 ---- else ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } + int group_el = 0; + unsigned HOST_WIDE_INT + elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); for (j = 0; j < ncopies; j++) { if (nloads > 1) + vec_alloc (v, nloads); + for (i = 0; i < nloads; i++) { ! tree this_off = build_int_cst (TREE_TYPE (alias_off), ! group_el * elsz); ! new_stmt = gimple_build_assign (make_ssa_name (ltype), ! build2 (MEM_REF, ltype, ! running_off, this_off)); ! vect_finish_stmt_generation (stmt, new_stmt, gsi); ! if (nloads > 1) ! CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, ! gimple_assign_lhs (new_stmt)); ! ! group_el += lnel; ! if (! slp ! || group_el == group_size) { ! tree newoff = copy_ssa_name (running_off); ! gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, ! running_off, stride_step); vect_finish_stmt_generation (stmt, incr, gsi); running_off = newoff; + group_el = 0; } } ! if (nloads > 1) { ! tree vec_inv = build_constructor (vectype, v); ! new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi); ! 
new_stmt = SSA_NAME_DEF_STMT (new_temp); } if (slp) Index: gcc/testsuite/gcc.dg/vect/slp-43.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-43.c (revision 0) --- gcc/testsuite/gcc.dg/vect/slp-43.c (revision 0) *************** *** 0 **** --- 1,78 ---- + /* { dg-do run } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-O3" } */ + + #include <string.h> + #include "tree-vect.h" + + #define FOO(T,N) \ + void __attribute__((noinline,noclone)) \ + foo_ ## T ## _ ## N (T * __restrict__ in_, T * __restrict__ out_, int s) \ + { \ + T *in = __builtin_assume_aligned (in_, __BIGGEST_ALIGNMENT__); \ + T *out = __builtin_assume_aligned (out_, __BIGGEST_ALIGNMENT__); \ + for (int i = 0; i < 16; i++) \ + { \ + for (int j = 0; j < N; ++j) \ + out[j] = in[j]; \ + in += s*N; \ + out += N; \ + } \ + } + + #define TEST(T,N) \ + do { \ + memset (out, 0, 4096); \ + foo_ ## T ## _ ## N ((T *)in, (T *)out, 1); \ + if (memcmp (in, out, sizeof (T) * 16 * N) != 0) \ + __builtin_abort (); \ + for (int i = sizeof (T) * 16 * N; i < 4096; ++i) \ + if (out[i] != 0) \ + __builtin_abort (); \ + } while (0) + + FOO(char, 1) + FOO(char, 2) + FOO(char, 3) + FOO(char, 4) + FOO(char, 6) + FOO(char, 8) + FOO(int, 1) + FOO(int, 2) + FOO(int, 3) + FOO(int, 4) + FOO(int, 6) + FOO(int, 8) + FOO(int, 16) + + char in[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__))); + char out[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__))); + + int main() + { + check_vect (); + + for (int i = 0; i < 4096; ++i) + { + in[i] = i; + __asm__ volatile ("" : : : "memory"); + } + + TEST(char, 1); + TEST(char, 2); + TEST(char, 3); + TEST(char, 4); + TEST(char, 6); + TEST(char, 8); + TEST(int, 1); + TEST(int, 2); + TEST(int, 3); + TEST(int, 4); + TEST(int, 6); + TEST(int, 8); + TEST(int, 16); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 13 "vect" } } */