This patch allows all SLP load permutations we can generate (according to the target). Bootstrap and regtest pending on x86_64-unknown-linux-gnu.
Richard. 2015-06-03 Richard Biener <rguent...@suse.de> * tree-vect-stmts.c (vectorizable_load): Compute the pointer adjustment for gaps at the end of a SLP load group properly. * tree-vect-slp.c (vect_supported_load_permutation_p): Allow all permutations we can generate. * gcc.dg/vect/slp-perm-10.c: New testcase. * gcc.dg/vect/slp-23.c: Adjust. Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c (revision 224061) --- gcc/tree-vect-stmts.c (working copy) *************** vectorizable_load (gimple stmt, gimple_s *** 5807,5813 **** gimple ptr_incr = NULL; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size = -1, group_gap; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree byte_offset = NULL_TREE; --- 5807,5813 ---- gimple ptr_incr = NULL; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size = -1, group_gap_adj; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree byte_offset = NULL_TREE; *************** vectorizable_load (gimple stmt, gimple_s *** 6402,6413 **** { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); } else { vec_num = group_size; ! group_gap = 0; } } else --- 6402,6413 ---- { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! group_gap_adj = vf * group_size - nunits * vec_num; } else { vec_num = group_size; ! group_gap_adj = 0; } } else *************** vectorizable_load (gimple stmt, gimple_s *** 6415,6421 **** first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; ! group_gap = 0; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); --- 6415,6421 ---- first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; ! 
group_gap_adj = 0; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); *************** vectorizable_load (gimple stmt, gimple_s *** 6832,6842 **** SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); } /* Bump the vector pointer to account for a gap. */ ! if (slp && group_gap != 0) { tree bump = size_binop (MULT_EXPR, TYPE_SIZE_UNIT (elem_type), ! size_int (group_gap)); dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump); } --- 6832,6842 ---- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); } /* Bump the vector pointer to account for a gap. */ ! if (group_gap_adj != 0) { tree bump = size_binop (MULT_EXPR, TYPE_SIZE_UNIT (elem_type), ! size_int (group_gap_adj)); dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump); } Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c (revision 224061) --- gcc/tree-vect-slp.c (working copy) *************** vect_supported_load_permutation_p (slp_i *** 1506,1552 **** return true; } ! /* FORNOW: the only supported permutation is 0..01..1.. of length equal to ! GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as ! well (unless it's reduction). */ ! if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size) ! return false; ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! if (!node->load_permutation.exists ()) ! return false; ! ! load_index = sbitmap_alloc (group_size); ! bitmap_clear (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! { ! unsigned int lidx = node->load_permutation[0]; ! if (bitmap_bit_p (load_index, lidx)) ! { ! sbitmap_free (load_index); ! return false; ! } ! bitmap_set_bit (load_index, lidx); ! FOR_EACH_VEC_ELT (node->load_permutation, j, k) ! if (k != lidx) ! { ! sbitmap_free (load_index); ! return false; ! } ! } ! for (i = 0; i < group_size; i++) ! if (!bitmap_bit_p (load_index, i)) ! { ! sbitmap_free (load_index); ! 
return false; ! } ! sbitmap_free (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (node->load_permutation.exists () && !vect_transform_slp_perm_load (node, vNULL, NULL, SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) return false; return true; } --- 1504,1517 ---- return true; } ! /* For loop vectorization verify we can generate the permutation. */ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (node->load_permutation.exists () && !vect_transform_slp_perm_load (node, vNULL, NULL, SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) return false; + return true; } Index: gcc/testsuite/gcc.dg/vect/slp-23.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-23.c (revision 224061) --- gcc/testsuite/gcc.dg/vect/slp-23.c (working copy) *************** int main (void) *** 108,112 **** /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_strided8 && { ! { vect_no_align} } } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided8 || vect_no_align } } } } } */ ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ --- 108,113 ---- /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_strided8 && { ! { vect_no_align} } } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided8 || vect_no_align } } } } } */ ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */ ! 
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */ Index: gcc/testsuite/gcc.dg/vect/slp-perm-10.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-perm-10.c (revision 0) --- gcc/testsuite/gcc.dg/vect/slp-perm-10.c (working copy) *************** *** 0 **** --- 1,53 ---- + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" + + int a[256], b[256]; + + void __attribute__((noinline)) + foo (void) + { + int i; + for (i = 0; i < 32; ++i) + { + b[i*8+0] = a[i*8+0]; + b[i*8+1] = a[i*8+0]; + b[i*8+2] = a[i*8+3]; + b[i*8+3] = a[i*8+3]; + b[i*8+4] = a[i*8+4]; + b[i*8+5] = a[i*8+6]; + b[i*8+6] = a[i*8+4]; + b[i*8+7] = a[i*8+6]; + } + } + + int main () + { + int i; + + check_vect (); + + for (i = 0; i < 256; ++i) + { + a[i] = i; + __asm__ volatile (""); + } + + foo (); + + for (i = 0; i < 32; ++i) + if (b[i*8+0] != i*8+0 + || b[i*8+1] != i*8+0 + || b[i*8+2] != i*8+3 + || b[i*8+3] != i*8+3 + || b[i*8+4] != i*8+4 + || b[i*8+5] != i*8+6 + || b[i*8+6] != i*8+4 + || b[i*8+7] != i*8+6) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ + /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */