I've added a bug report for the stores group case: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote: > Ping. > Test is modified according to the fix in the test for loads. > > diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c > b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > new file mode 100644 > index 0000000..e7161f7 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > @@ -0,0 +1,21 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */ > + > +#define byte unsigned char > + > +void > +matrix_mul (byte *in, byte *out, int size) > +{ > + int i; > + for (i = 0; i < size; i++) > + { > + out[0] = in[0] + in[1] + in[3]; > + out[1] = in[0] + in[2] + in[4]; > + out[2] = in[1] + in[2] + in[4]; > + in += 4; > + out += 3; > + } > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { > target { i?86-*-* x86_64-*-* } } } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > > On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote: >> 2nd part of patch is on stores group. >> Bootstrap and make check passed on x86. >> >> Is it ok? >> >> 2014-05-06 Evgeny Stupachenko <evstu...@gmail.com> >> >> * tree-vect-data-refs.c (vect_grouped_store_supported): New >> check for storess group of length 3. >> (vect_permute_store_chain): New permutations for storess group of >> length 3. >> * tree-vect-stmts.c (vect_model_store_cost): Change cost >> of vec_perm_shuffle for the new permutations. >> >> ChangeLog for testsuite: >> >> 2014-05-06 Evgeny Stupachenko <evstu...@gmail.com> >> >> PR tree-optimization/52252 >> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. >> >> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c >> index ef710cf..fb0e30d 100644 >> --- a/gcc/tree-vect-data-refs.c >> +++ b/gcc/tree-vect-data-refs.c >> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype, >> unsigned HOST_WIDE_INT count) >> { >> enum machine_mode mode = TYPE_MODE (vectype); >> >> - /* vect_permute_store_chain requires the group size to be a power of two. >> */ >> - if (exact_log2 (count) == -1) >> + /* vect_permute_store_chain requires the group size to be equal to 3 or >> + be a power of two. */ >> + if (count != 3 && exact_log2 (count) == -1) >> { >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> - "the size of the group of accesses" >> - " is not a power of 2\n"); >> + "the size of the group of accesses" >> + " is not a power of 2 or not eqaul to 3\n"); >> return false; >> } >> >> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype, >> unsigned HOST_WIDE_INT count) >> { >> unsigned int i, nelt = GET_MODE_NUNITS (mode); >> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); >> - for (i = 0; i < nelt / 2; i++) >> + >> + if (count == 3) >> { >> - sel[i * 2] = i; >> - sel[i * 2 + 1] = i + nelt; >> + unsigned int j0 = 0, j1 = 0, j2 = 0; >> + unsigned int i, j; >> + >> + for (j = 0; j < 3; j++) >> + { >> + int nelt0 = ((3 - j) * nelt) % 3; >> + int nelt1 = ((3 - j) * nelt + 1) % 3; >> + int nelt2 = ((3 - j) * nelt + 2) % 3; >> + for (i = 0; i < nelt; i++) >> + { >> + if (3 * i + nelt0 < nelt) >> + sel[3 * i + nelt0] = j0++; >> + if (3 * i + nelt1 < nelt) >> + sel[3 * i + nelt1] = nelt + j1++; >> + if (3 * i + nelt2 < nelt) >> + sel[3 * i + nelt2] = 0; >> + } >> + if (!can_vec_perm_p (mode, false, sel)) >> + { >> + if (dump_enabled_p ()) >> + dump_printf (MSG_MISSED_OPTIMIZATION, >> + "permutaion op not supported by target.\n"); >> + return false; >> + } >> + >> + for (i = 0; i < nelt; i++) >> + { >> + if (3 * i + nelt0 < nelt) >> + sel[3 * i + nelt0] = 3 * i + nelt0; >> + if (3 * i + nelt1 < nelt) >> + sel[3 * i + nelt1] = 3 * i + nelt1; >> + if (3 * i + nelt2 < nelt) >> + sel[3 * i + nelt2] = nelt + j2++; >> + } >> + if (!can_vec_perm_p (mode, false, sel)) >> + { >> + if (dump_enabled_p ()) >> + dump_printf (MSG_MISSED_OPTIMIZATION, >> + "permutaion op not supported by target.\n"); >> + return false; >> + } >> + } >> + return true; >> } >> - if (can_vec_perm_p (mode, false, sel)) >> + else >> { >> - for (i = 0; i < nelt; i++) >> - sel[i] += nelt / 2; >> - if (can_vec_perm_p (mode, false, sel)) >> - return true; >> + /* If length is not equal to 3 then only power of 2 is supported. >> */ >> + gcc_assert (exact_log2 (count) != -1); >> + >> + for (i = 0; i < nelt / 2; i++) >> + { >> + sel[i * 2] = i; >> + sel[i * 2 + 1] = i + nelt; >> + } >> + if (can_vec_perm_p (mode, false, sel)) >> + { >> + for (i = 0; i < nelt; i++) >> + sel[i] += nelt / 2; >> + if (can_vec_perm_p (mode, false, sel)) >> + return true; >> + } >> } >> } >> >> if (dump_enabled_p ()) >> dump_printf (MSG_MISSED_OPTIMIZATION, >> - "interleave op not supported by target.\n"); >> + "permutaion op not supported by target.\n"); >> return false; >> } >> >> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype, >> unsigned HOST_WIDE_INT count) >> /* Function vect_permute_store_chain. >> >> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be >> - a power of 2, generate interleave_high/low stmts to reorder the data >> - correctly for the stores. Return the final references for stores in >> - RESULT_CHAIN. >> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder >> + the data correctly for the stores. Return the final references for >> stores >> + in RESULT_CHAIN. >> >> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. >> The input is 4 vectors each containing 8 elements. We assign a number to >> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain, >> gimple perm_stmt; >> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); >> tree perm_mask_low, perm_mask_high; >> - unsigned int i, n; >> + tree data_ref; >> + tree perm3_mask_low, perm3_mask_high; >> + unsigned int i, n, log_length = exact_log2 (length); >> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); >> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); >> >> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain, >> memcpy (result_chain->address (), dr_chain.address (), >> length * sizeof (tree)); >> >> - for (i = 0, n = nelt / 2; i < n; i++) >> + if (length == 3) >> { >> - sel[i * 2] = i; >> - sel[i * 2 + 1] = i + nelt; >> - } >> - perm_mask_high = vect_gen_perm_mask (vectype, sel); >> - gcc_assert (perm_mask_high != NULL); >> + unsigned int j0 = 0, j1 = 0, j2 = 0; >> >> - for (i = 0; i < nelt; i++) >> - sel[i] += nelt / 2; >> - perm_mask_low = vect_gen_perm_mask (vectype, sel); >> - gcc_assert (perm_mask_low != NULL); >> + for (j = 0; j < 3; j++) >> + { >> + int nelt0 = ((3 - j) * nelt) % 3; >> + int nelt1 = ((3 - j) * nelt + 1) % 3; >> + int nelt2 = ((3 - j) * nelt + 2) % 3; >> >> - for (i = 0, n = exact_log2 (length); i < n; i++) >> - { >> - for (j = 0; j < length/2; j++) >> - { >> - vect1 = dr_chain[j]; >> - vect2 = dr_chain[j+length/2]; >> + for (i = 0; i < nelt; i++) >> + { >> + if (3 * i + nelt0 < nelt) >> + sel[3 * i + nelt0] = j0++; >> + if (3 * i + nelt1 < nelt) >> + sel[3 * i + nelt1] = nelt + j1++; >> + if (3 * i + nelt2 < nelt) >> + sel[3 * i + nelt2] = 0; >> + } >> + perm3_mask_low = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm3_mask_low != NULL); >> + >> + for (i = 0; i < nelt; i++) >> + { >> + if (3 * i + nelt0 < nelt) >> + sel[3 * i + nelt0] = 3 * i + nelt0; >> + if (3 * i + nelt1 < nelt) >> + sel[3 * i + nelt1] = 3 * i + nelt1; >> + if (3 * i + nelt2 < nelt) >> + sel[3 * i + nelt2] = nelt + j2++; >> + } >> + perm3_mask_high = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm3_mask_high != NULL); >> + >> + vect1 = dr_chain[0]; >> + vect2 = dr_chain[1]; >> >> /* Create interleaving stmt: >> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> >> */ >> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); >> - perm_stmt >> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, >> - vect1, vect2, perm_mask_high); >> + low = VEC_PERM_EXPR <vect1, vect2, >> + {j, nelt, *, j + 1, nelt + j + 1, *, >> + j + 2, nelt + j + 2, *, ...}> */ >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> + vect1, vect2, >> + perm3_mask_low); >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> - (*result_chain)[2*j] = high; >> >> + vect1 = data_ref; >> + vect2 = dr_chain[2]; >> /* Create interleaving stmt: >> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1, >> - nelt*3/2+1, ...}> */ >> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); >> - perm_stmt >> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, >> - vect1, vect2, perm_mask_low); >> + low = VEC_PERM_EXPR <vect1, vect2, >> + {0, 1, nelt + j, 3, 4, nelt + j + 1, >> + 6, 7, nelt + j + 2, ...}> */ >> + data_ref = make_temp_ssa_name (vectype, NULL, >> "vect_shuffle3_high"); >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> + vect1, vect2, >> + perm3_mask_high); >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> - (*result_chain)[2*j+1] = low; >> + (*result_chain)[j] = data_ref; >> + } >> + } >> + else >> + { >> + /* If length is not equal to 3 then only power of 2 is supported. */ >> + gcc_assert (exact_log2 (length) != -1); >> + >> + for (i = 0, n = nelt / 2; i < n; i++) >> + { >> + sel[i * 2] = i; >> + sel[i * 2 + 1] = i + nelt; >> } >> - memcpy (dr_chain.address (), result_chain->address (), >> - length * sizeof (tree)); >> + perm_mask_high = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm_mask_high != NULL); >> + >> + for (i = 0; i < nelt; i++) >> + sel[i] += nelt / 2; >> + perm_mask_low = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm_mask_low != NULL); >> + >> + for (i = 0, n = log_length; i < n; i++) >> + { >> + for (j = 0; j < length/2; j++) >> + { >> + vect1 = dr_chain[j]; >> + vect2 = dr_chain[j+length/2]; >> + >> + /* Create interleaving stmt: >> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, >> + ...}> */ >> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); >> + perm_stmt >> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, >> + vect1, vect2, >> perm_mask_high); >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> + (*result_chain)[2*j] = high; >> + >> + /* Create interleaving stmt: >> + low = VEC_PERM_EXPR <vect1, vect2, >> + {nelt/2, nelt*3/2, nelt/2+1, >> nelt*3/2+1, >> + ...}> */ >> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); >> + perm_stmt >> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, >> + vect1, vect2, >> perm_mask_low); >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> + (*result_chain)[2*j+1] = low; >> + } >> + memcpy (dr_chain.address (), result_chain->address (), >> + length * sizeof (tree)); >> + } >> } >> } >> >> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c >> index b87c143..24d0b94 100644 >> --- a/gcc/tree-vect-stmts.c >> +++ b/gcc/tree-vect-stmts.c >> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int >> ncopies, >> include the cost of the permutes. */ >> if (!store_lanes_p && group_size > 1) >> { >> - /* Uses a high and low interleave operation for each needed permute. >> */ >> - >> - int nstmts = ncopies * exact_log2 (group_size) * group_size; >> + /* Uses a high and low interleave or shuffle operations for each >> + needed permute. */ >> + int nstmts = ncopies * ceil_log2 (group_size) * group_size; >> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, >> stmt_info, 0, vect_body); >> >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> new file mode 100644 >> index 0000000..cc1e72e >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> @@ -0,0 +1,21 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 >> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ >> + >> +#define byte unsigned char >> + >> +void >> +matrix_mul (byte *in, byte *out, int size) >> +{ >> + int i; >> + for (i = 0; i < size; i++) >> + { >> + out[0] = in[0] + in[1] + in[3]; >> + out[1] = in[0] + in[2] + in[4]; >> + out[2] = in[1] + in[2] + in[4]; >> + in += 4; >> + out += 3; >> + } >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ >> +/* { dg-final { cleanup-tree-dump "vect" } } */