ping. The changes are similar to already committed on loads group.
On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote: > I've added a bug report for the stores group case: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403 > > > On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstu...@gmail.com> > wrote: >> Ping. >> Test is modified according to the fix in the test for loads. >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> new file mode 100644 >> index 0000000..e7161f7 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >> @@ -0,0 +1,21 @@ >> +/* { dg-do compile } */ >> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */ >> + >> +#define byte unsigned char >> + >> +void >> +matrix_mul (byte *in, byte *out, int size) >> +{ >> + int i; >> + for (i = 0; i < size; i++) >> + { >> + out[0] = in[0] + in[1] + in[3]; >> + out[1] = in[0] + in[2] + in[4]; >> + out[2] = in[1] + in[2] + in[4]; >> + in += 4; >> + out += 3; >> + } >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { >> target { i?86-*-* x86_64-*-* } } } } */ >> +/* { dg-final { cleanup-tree-dump "vect" } } */ >> >> >> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstu...@gmail.com> >> wrote: >>> 2nd part of patch is on stores group. >>> Bootstrap and make check passed on x86. >>> >>> Is it ok? >>> >>> 2014-05-06 Evgeny Stupachenko <evstu...@gmail.com> >>> >>> * tree-vect-data-refs.c (vect_grouped_store_supported): New >>> check for storess group of length 3. >>> (vect_permute_store_chain): New permutations for storess group of >>> length 3. >>> * tree-vect-stmts.c (vect_model_store_cost): Change cost >>> of vec_perm_shuffle for the new permutations. >>> >>> ChangeLog for testsuite: >>> >>> 2014-05-06 Evgeny Stupachenko <evstu...@gmail.com> >>> >>> PR tree-optimization/52252 >>> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. >>> >>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c >>> index ef710cf..fb0e30d 100644 >>> --- a/gcc/tree-vect-data-refs.c >>> +++ b/gcc/tree-vect-data-refs.c >>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype, >>> unsigned HOST_WIDE_INT count) >>> { >>> enum machine_mode mode = TYPE_MODE (vectype); >>> >>> - /* vect_permute_store_chain requires the group size to be a power of >>> two. */ >>> - if (exact_log2 (count) == -1) >>> + /* vect_permute_store_chain requires the group size to be equal to 3 or >>> + be a power of two. */ >>> + if (count != 3 && exact_log2 (count) == -1) >>> { >>> if (dump_enabled_p ()) >>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >>> - "the size of the group of accesses" >>> - " is not a power of 2\n"); >>> + "the size of the group of accesses" >>> + " is not a power of 2 or not eqaul to 3\n"); >>> return false; >>> } >>> >>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype, >>> unsigned HOST_WIDE_INT count) >>> { >>> unsigned int i, nelt = GET_MODE_NUNITS (mode); >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); >>> - for (i = 0; i < nelt / 2; i++) >>> + >>> + if (count == 3) >>> { >>> - sel[i * 2] = i; >>> - sel[i * 2 + 1] = i + nelt; >>> + unsigned int j0 = 0, j1 = 0, j2 = 0; >>> + unsigned int i, j; >>> + >>> + for (j = 0; j < 3; j++) >>> + { >>> + int nelt0 = ((3 - j) * nelt) % 3; >>> + int nelt1 = ((3 - j) * nelt + 1) % 3; >>> + int nelt2 = ((3 - j) * nelt + 2) % 3; >>> + for (i = 0; i < nelt; i++) >>> + { >>> + if (3 * i + nelt0 < nelt) >>> + sel[3 * i + nelt0] = j0++; >>> + if (3 * i + nelt1 < nelt) >>> + sel[3 * i + nelt1] = nelt + j1++; >>> + if (3 * i + nelt2 < nelt) >>> + sel[3 * i + nelt2] = 0; >>> + } >>> + if (!can_vec_perm_p (mode, false, sel)) >>> + { >>> + if (dump_enabled_p ()) >>> + dump_printf (MSG_MISSED_OPTIMIZATION, >>> + "permutaion op not supported by >>> target.\n"); >>> + return false; >>> + } >>> + >>> + for (i = 0; i < nelt; i++) >>> + { >>> + if (3 * i + nelt0 < nelt) >>> + sel[3 * i + nelt0] = 3 * i + nelt0; >>> + if (3 * i + nelt1 < nelt) >>> + sel[3 * i + nelt1] = 3 * i + nelt1; >>> + if (3 * i + nelt2 < nelt) >>> + sel[3 * i + nelt2] = nelt + j2++; >>> + } >>> + if (!can_vec_perm_p (mode, false, sel)) >>> + { >>> + if (dump_enabled_p ()) >>> + dump_printf (MSG_MISSED_OPTIMIZATION, >>> + "permutaion op not supported by >>> target.\n"); >>> + return false; >>> + } >>> + } >>> + return true; >>> } >>> - if (can_vec_perm_p (mode, false, sel)) >>> + else >>> { >>> - for (i = 0; i < nelt; i++) >>> - sel[i] += nelt / 2; >>> - if (can_vec_perm_p (mode, false, sel)) >>> - return true; >>> + /* If length is not equal to 3 then only power of 2 is supported. >>> */ >>> + gcc_assert (exact_log2 (count) != -1); >>> + >>> + for (i = 0; i < nelt / 2; i++) >>> + { >>> + sel[i * 2] = i; >>> + sel[i * 2 + 1] = i + nelt; >>> + } >>> + if (can_vec_perm_p (mode, false, sel)) >>> + { >>> + for (i = 0; i < nelt; i++) >>> + sel[i] += nelt / 2; >>> + if (can_vec_perm_p (mode, false, sel)) >>> + return true; >>> + } >>> } >>> } >>> >>> if (dump_enabled_p ()) >>> dump_printf (MSG_MISSED_OPTIMIZATION, >>> - "interleave op not supported by target.\n"); >>> + "permutaion op not supported by target.\n"); >>> return false; >>> } >>> >>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype, >>> unsigned HOST_WIDE_INT count) >>> /* Function vect_permute_store_chain. >>> >>> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be >>> - a power of 2, generate interleave_high/low stmts to reorder the data >>> - correctly for the stores. Return the final references for stores in >>> - RESULT_CHAIN. >>> + a power of 2 or equal to 3, generate interleave_high/low stmts to >>> reorder >>> + the data correctly for the stores. Return the final references for >>> stores >>> + in RESULT_CHAIN. >>> >>> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. >>> The input is 4 vectors each containing 8 elements. We assign a number >>> to >>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain, >>> gimple perm_stmt; >>> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); >>> tree perm_mask_low, perm_mask_high; >>> - unsigned int i, n; >>> + tree data_ref; >>> + tree perm3_mask_low, perm3_mask_high; >>> + unsigned int i, n, log_length = exact_log2 (length); >>> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); >>> >>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain, >>> memcpy (result_chain->address (), dr_chain.address (), >>> length * sizeof (tree)); >>> >>> - for (i = 0, n = nelt / 2; i < n; i++) >>> + if (length == 3) >>> { >>> - sel[i * 2] = i; >>> - sel[i * 2 + 1] = i + nelt; >>> - } >>> - perm_mask_high = vect_gen_perm_mask (vectype, sel); >>> - gcc_assert (perm_mask_high != NULL); >>> + unsigned int j0 = 0, j1 = 0, j2 = 0; >>> >>> - for (i = 0; i < nelt; i++) >>> - sel[i] += nelt / 2; >>> - perm_mask_low = vect_gen_perm_mask (vectype, sel); >>> - gcc_assert (perm_mask_low != NULL); >>> + for (j = 0; j < 3; j++) >>> + { >>> + int nelt0 = ((3 - j) * nelt) % 3; >>> + int nelt1 = ((3 - j) * nelt + 1) % 3; >>> + int nelt2 = ((3 - j) * nelt + 2) % 3; >>> >>> - for (i = 0, n = exact_log2 (length); i < n; i++) >>> - { >>> - for (j = 0; j < length/2; j++) >>> - { >>> - vect1 = dr_chain[j]; >>> - vect2 = dr_chain[j+length/2]; >>> + for (i = 0; i < nelt; i++) >>> + { >>> + if (3 * i + nelt0 < nelt) >>> + sel[3 * i + nelt0] = j0++; >>> + if (3 * i + nelt1 < nelt) >>> + sel[3 * i + nelt1] = nelt + j1++; >>> + if (3 * i + nelt2 < nelt) >>> + sel[3 * i + nelt2] = 0; >>> + } >>> + perm3_mask_low = vect_gen_perm_mask (vectype, sel); >>> + gcc_assert (perm3_mask_low != NULL); >>> + >>> + for (i = 0; i < nelt; i++) >>> + { >>> + if (3 * i + nelt0 < nelt) >>> + sel[3 * i + nelt0] = 3 * i + nelt0; >>> + if (3 * i + nelt1 < nelt) >>> + sel[3 * i + nelt1] = 3 * i + nelt1; >>> + if (3 * i + nelt2 < nelt) >>> + sel[3 * i + nelt2] = nelt + j2++; >>> + } >>> + perm3_mask_high = vect_gen_perm_mask (vectype, sel); >>> + gcc_assert (perm3_mask_high != NULL); >>> + >>> + vect1 = dr_chain[0]; >>> + vect2 = dr_chain[1]; >>> >>> /* Create interleaving stmt: >>> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> >>> */ >>> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); >>> - perm_stmt >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, >>> - vect1, vect2, perm_mask_high); >>> + low = VEC_PERM_EXPR <vect1, vect2, >>> + {j, nelt, *, j + 1, nelt + j + 1, *, >>> + j + 2, nelt + j + 2, *, ...}> */ >>> + data_ref = make_temp_ssa_name (vectype, NULL, >>> "vect_shuffle3_low"); >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> + vect1, vect2, >>> + perm3_mask_low); >>> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - (*result_chain)[2*j] = high; >>> >>> + vect1 = data_ref; >>> + vect2 = dr_chain[2]; >>> /* Create interleaving stmt: >>> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1, >>> - nelt*3/2+1, ...}> */ >>> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); >>> - perm_stmt >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, >>> - vect1, vect2, perm_mask_low); >>> + low = VEC_PERM_EXPR <vect1, vect2, >>> + {0, 1, nelt + j, 3, 4, nelt + j + 1, >>> + 6, 7, nelt + j + 2, ...}> */ >>> + data_ref = make_temp_ssa_name (vectype, NULL, >>> "vect_shuffle3_high"); >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> + vect1, vect2, >>> + perm3_mask_high); >>> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - (*result_chain)[2*j+1] = low; >>> + (*result_chain)[j] = data_ref; >>> + } >>> + } >>> + else >>> + { >>> + /* If length is not equal to 3 then only power of 2 is supported. */ >>> + gcc_assert (exact_log2 (length) != -1); >>> + >>> + for (i = 0, n = nelt / 2; i < n; i++) >>> + { >>> + sel[i * 2] = i; >>> + sel[i * 2 + 1] = i + nelt; >>> } >>> - memcpy (dr_chain.address (), result_chain->address (), >>> - length * sizeof (tree)); >>> + perm_mask_high = vect_gen_perm_mask (vectype, sel); >>> + gcc_assert (perm_mask_high != NULL); >>> + >>> + for (i = 0; i < nelt; i++) >>> + sel[i] += nelt / 2; >>> + perm_mask_low = vect_gen_perm_mask (vectype, sel); >>> + gcc_assert (perm_mask_low != NULL); >>> + >>> + for (i = 0, n = log_length; i < n; i++) >>> + { >>> + for (j = 0; j < length/2; j++) >>> + { >>> + vect1 = dr_chain[j]; >>> + vect2 = dr_chain[j+length/2]; >>> + >>> + /* Create interleaving stmt: >>> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, >>> + ...}> */ >>> + high = make_temp_ssa_name (vectype, NULL, >>> "vect_inter_high"); >>> + perm_stmt >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, >>> + vect1, vect2, >>> perm_mask_high); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + (*result_chain)[2*j] = high; >>> + >>> + /* Create interleaving stmt: >>> + low = VEC_PERM_EXPR <vect1, vect2, >>> + {nelt/2, nelt*3/2, nelt/2+1, >>> nelt*3/2+1, >>> + ...}> */ >>> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); >>> + perm_stmt >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, >>> + vect1, vect2, >>> perm_mask_low); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + (*result_chain)[2*j+1] = low; >>> + } >>> + memcpy (dr_chain.address (), result_chain->address (), >>> + length * sizeof (tree)); >>> + } >>> } >>> } >>> >>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c >>> index b87c143..24d0b94 100644 >>> --- a/gcc/tree-vect-stmts.c >>> +++ b/gcc/tree-vect-stmts.c >>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int >>> ncopies, >>> include the cost of the permutes. */ >>> if (!store_lanes_p && group_size > 1) >>> { >>> - /* Uses a high and low interleave operation for each needed permute. >>> */ >>> - >>> - int nstmts = ncopies * exact_log2 (group_size) * group_size; >>> + /* Uses a high and low interleave or shuffle operations for each >>> + needed permute. */ >>> + int nstmts = ncopies * ceil_log2 (group_size) * group_size; >>> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, >>> stmt_info, 0, vect_body); >>> >>> >>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c >>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >>> new file mode 100644 >>> index 0000000..cc1e72e >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c >>> @@ -0,0 +1,21 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 >>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ >>> + >>> +#define byte unsigned char >>> + >>> +void >>> +matrix_mul (byte *in, byte *out, int size) >>> +{ >>> + int i; >>> + for (i = 0; i < size; i++) >>> + { >>> + out[0] = in[0] + in[1] + in[3]; >>> + out[1] = in[0] + in[2] + in[4]; >>> + out[2] = in[1] + in[2] + in[4]; >>> + in += 4; >>> + out += 3; >>> + } >>> +} >>> + >>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ >>> +/* { dg-final { cleanup-tree-dump "vect" } } */