On Wed, 3 Jun 2015, Richard Biener wrote: > > This allows all permutations we can generate (according to the target). > > Bootstrap and regtest pending on x86_64-unknown-linux-gnu.
So this turned up other issues thus the following is what I have committed after bootstrapping and testing on x86_64-unknown-linux-gnu. Richard. 2015-06-08 Richard Biener <rguent...@suse.de> * tree-vect-stmts.c (vectorizable_load): Compute the pointer adjustment for gaps at the end of a SLP load group properly. * tree-vect-slp.c (vect_supported_load_permutation_p): Allow all permutations we can generate. (vect_transform_slp_perm_load): Use the correct group-size. * gcc.dg/vect/slp-perm-10.c: New testcase. * gcc.dg/vect/slp-23.c: Adjust. * gcc.dg/torture/pr53366-2.c: Also verify cross-iteration vector pointer update. Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c (revision 224077) --- gcc/tree-vect-stmts.c (working copy) *************** vectorizable_load (gimple stmt, gimple_s *** 5807,5813 **** gimple ptr_incr = NULL; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size = -1, group_gap; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree byte_offset = NULL_TREE; --- 5807,5813 ---- gimple ptr_incr = NULL; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size = -1, group_gap_adj; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree byte_offset = NULL_TREE; *************** vectorizable_load (gimple stmt, gimple_s *** 6396,6421 **** } first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); /* VEC_NUM is the number of vect stmts to be created for this group. */ if (slp) { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); } else ! { ! vec_num = group_size; ! group_gap = 0; ! } } else { first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; ! 
group_gap = 0; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); --- 6396,6419 ---- } first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); + group_gap_adj = 0; /* VEC_NUM is the number of vect stmts to be created for this group. */ if (slp) { grouped_load = false; vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ! group_gap_adj = vf * group_size - nunits * vec_num; } else ! vec_num = group_size; } else { first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; ! group_gap_adj = 0; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); *************** vectorizable_load (gimple stmt, gimple_s *** 6831,6842 **** if (slp && !slp_perm) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); } ! /* Bump the vector pointer to account for a gap. */ ! if (slp && group_gap != 0) { ! tree bump = size_binop (MULT_EXPR, ! TYPE_SIZE_UNIT (elem_type), ! size_int (group_gap)); dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump); } --- 6829,6843 ---- if (slp && !slp_perm) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); } ! /* Bump the vector pointer to account for a gap or for excess ! elements loaded for a permuted SLP load. */ ! if (group_gap_adj != 0) { ! bool ovf; ! tree bump ! = wide_int_to_tree (sizetype, ! wi::smul (TYPE_SIZE_UNIT (elem_type), ! group_gap_adj, &ovf)); dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump); } Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c (revision 224077) --- gcc/tree-vect-slp.c (working copy) *************** vect_supported_load_permutation_p (slp_i *** 1502,1548 **** return true; } ! /* FORNOW: the only supported permutation is 0..01..1.. of length equal to ! GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as ! well (unless it's reduction). */ ! 
if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size) ! return false; ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! if (!node->load_permutation.exists ()) ! return false; ! ! load_index = sbitmap_alloc (group_size); ! bitmap_clear (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) ! { ! unsigned int lidx = node->load_permutation[0]; ! if (bitmap_bit_p (load_index, lidx)) ! { ! sbitmap_free (load_index); ! return false; ! } ! bitmap_set_bit (load_index, lidx); ! FOR_EACH_VEC_ELT (node->load_permutation, j, k) ! if (k != lidx) ! { ! sbitmap_free (load_index); ! return false; ! } ! } ! for (i = 0; i < group_size; i++) ! if (!bitmap_bit_p (load_index, i)) ! { ! sbitmap_free (load_index); ! return false; ! } ! sbitmap_free (load_index); ! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (node->load_permutation.exists () && !vect_transform_slp_perm_load (node, vNULL, NULL, SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) return false; return true; } --- 1517,1530 ---- return true; } ! /* For loop vectorization verify we can generate the permutation. */ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) if (node->load_permutation.exists () && !vect_transform_slp_perm_load (node, vNULL, NULL, SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true)) return false; + return true; } *************** vect_transform_slp_perm_load (slp_tree n *** 3287,3292 **** --- 3269,3276 ---- if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) return false; + stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)); + /* Generate permutation masks for every NODE. Number of masks for each NODE is equal to GROUP_SIZE. E.g., we have a group of three nodes with three loads from the same *************** vect_transform_slp_perm_load (slp_tree n *** 3321,3327 **** for (k = 0; k < group_size; k++) { i = SLP_TREE_LOAD_PERMUTATION (node)[k]; ! 
first_mask_element = i + j * group_size; if (!vect_get_mask_element (stmt, first_mask_element, 0, nunits, only_one_vec, index, mask, &current_mask_element, --- 3305,3311 ---- for (k = 0; k < group_size; k++) { i = SLP_TREE_LOAD_PERMUTATION (node)[k]; ! first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info); if (!vect_get_mask_element (stmt, first_mask_element, 0, nunits, only_one_vec, index, mask, &current_mask_element, Index: gcc/testsuite/gcc.dg/vect/slp-perm-10.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-perm-10.c (revision 0) --- gcc/testsuite/gcc.dg/vect/slp-perm-10.c (working copy) *************** *** 0 **** --- 1,53 ---- + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" + + int a[256], b[256]; + + void __attribute__((noinline)) + foo (void) + { + int i; + for (i = 0; i < 32; ++i) + { + b[i*8+0] = a[i*8+0]; + b[i*8+1] = a[i*8+0]; + b[i*8+2] = a[i*8+3]; + b[i*8+3] = a[i*8+3]; + b[i*8+4] = a[i*8+4]; + b[i*8+5] = a[i*8+6]; + b[i*8+6] = a[i*8+4]; + b[i*8+7] = a[i*8+6]; + } + } + + int main () + { + int i; + + check_vect (); + + for (i = 0; i < 256; ++i) + { + a[i] = i; + __asm__ volatile (""); + } + + foo (); + + for (i = 0; i < 32; ++i) + if (b[i*8+0] != i*8+0 + || b[i*8+1] != i*8+0 + || b[i*8+2] != i*8+3 + || b[i*8+3] != i*8+3 + || b[i*8+4] != i*8+4 + || b[i*8+5] != i*8+6 + || b[i*8+6] != i*8+4 + || b[i*8+7] != i*8+6) + abort (); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ + /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ Index: gcc/testsuite/gcc.dg/vect/slp-23.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-23.c (revision 224077) --- gcc/testsuite/gcc.dg/vect/slp-23.c (working copy) *************** int main (void) *** 108,112 **** /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {
target { vect_strided8 && { ! { vect_no_align} } } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided8 || vect_no_align } } } } } */ ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ --- 108,113 ---- /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_strided8 && { ! { vect_no_align} } } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided8 || vect_no_align } } } } } */ ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */ ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */ Index: gcc/testsuite/gcc.dg/torture/pr53366-2.c =================================================================== *** gcc/testsuite/gcc.dg/torture/pr53366-2.c (revision 224077) --- gcc/testsuite/gcc.dg/torture/pr53366-2.c (working copy) *************** *** 4,21 **** extern void abort (void); struct T { float r[3], i[3]; }; ! struct U { struct T j[2]; }; void __attribute__ ((noinline)) foo (struct U *__restrict y, const float _Complex *__restrict x) { int i, j; ! for (j = 0; j < 2; ++j) { float a = __real__ x[j]; float b = __imag__ x[j]; ! float c = __real__ x[j + 2]; ! float d = __imag__ x[j + 2]; for (i = 0; i < 3; ++i) { y->j[j].r[i] = y->j[j].r[i] + a + c; --- 4,21 ---- extern void abort (void); struct T { float r[3], i[3]; }; ! struct U { struct T j[4]; }; void __attribute__ ((noinline)) foo (struct U *__restrict y, const float _Complex *__restrict x) { int i, j; ! for (j = 0; j < 4; ++j) { float a = __real__ x[j]; float b = __imag__ x[j]; ! float c = __real__ x[j + 4]; ! float d = __imag__ x[j + 4]; for (i = 0; i < 3; ++i) { y->j[j].r[i] = y->j[j].r[i] + a + c; *************** foo (struct U *__restrict y, const float *** 24,43 **** } } ! _Complex float x[4]; struct U y; int main () { int i, j; ! 
for (i = 0; i < 4; ++i) ! x[i] = i + 1.0iF * (2 * i); foo (&y, x); ! for (j = 0; j < 2; ++j) for (i = 0; i < 3; ++i) ! if (y.j[j].r[i] != __real__ (x[j] + x[j + 2]) ! || y.j[j].i[i] != __imag__ (x[j] + x[j + 2])) __builtin_abort (); return 0; } --- 24,46 ---- } } ! _Complex float x[8]; struct U y; int main () { int i, j; ! for (i = 0; i < 8; ++i) ! { ! x[i] = i + 1.0iF * (2 * i); ! __asm__ volatile (""); ! } foo (&y, x); ! for (j = 0; j < 4; ++j) for (i = 0; i < 3; ++i) ! if (y.j[j].r[i] != __real__ (x[j] + x[j + 4]) ! || y.j[j].i[i] != __imag__ (x[j] + x[j + 4])) __builtin_abort (); return 0; }