The following removes the non-SLP store interleaving support, which was already almost unused.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. * tree-vectorizer.h (vect_grouped_store_supported): Remove. (vect_permute_store_chain): Likewise. * tree-vect-data-refs.cc (vect_grouped_store_supported): Remove. (vect_permute_store_chain): Likewise. * tree-vect-stmts.cc (vectorizable_store): Remove comment about store interleaving. * tree-vect-loop.cc (vect_analyze_loop_2): Do not consider store interleaving when disregarding single-lane SLP. --- gcc/tree-vect-data-refs.cc | 318 ------------------------------------- gcc/tree-vect-loop.cc | 3 +- gcc/tree-vect-stmts.cc | 33 ---- gcc/tree-vectorizer.h | 4 - 4 files changed, 1 insertion(+), 357 deletions(-) diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index b38eecd7901..27be3202fec 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -5954,126 +5954,6 @@ vect_create_destination_var (tree scalar_dest, tree vectype) return vec_dest; } -/* Function vect_grouped_store_supported. - - Returns TRUE if interleave high and interleave low permutations - are supported, and FALSE otherwise. */ - -bool -vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) -{ - machine_mode mode = TYPE_MODE (vectype); - - /* vect_permute_store_chain requires the group size to be equal to 3 or - be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2 or not eqaul to 3\n"); - return false; - } - - /* Check that the permutation is supported. 
*/ - if (VECTOR_MODE_P (mode)) - { - unsigned int i; - if (count == 3) - { - unsigned int j0 = 0, j1 = 0, j2 = 0; - unsigned int i, j; - - unsigned int nelt; - if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "cannot handle groups of 3 stores for" - " variable-length vectors\n"); - return false; - } - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (mode, mode, indices)) - { - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; - } - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (mode, mode, indices)) - { - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; - } - } - return true; - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (count)); - poly_uint64 nelt = GET_MODE_NUNITS (mode); - - /* The encoding has 2 interleaved stepped patterns. 
*/ - if(!multiple_p (nelt, 2)) - return false; - vec_perm_builder sel (nelt, 2, 3); - sel.quick_grow (6); - for (i = 0; i < 3; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - vec_perm_indices indices (sel, 2, nelt); - if (can_vec_perm_const_p (mode, mode, indices)) - { - for (i = 0; i < 6; i++) - sel[i] += exact_div (nelt, 2); - indices.new_vector (sel, 2, nelt); - if (can_vec_perm_const_p (mode, mode, indices)) - return true; - } - } - } - - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; -} - /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors of type VECTYPE. MASKED_P says whether the masked form is needed. */ @@ -6102,204 +5982,6 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, } -/* Function vect_permute_store_chain. - - Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate interleave_high/low stmts to reorder - the data correctly for the stores. Return the final references for stores - in RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to - each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 8 16 24 1 9 17 25 - 2nd vec: 2 10 18 26 3 11 19 27 - 3rd vec: 4 12 20 28 5 13 21 30 - 4th vec: 6 14 22 30 7 15 23 31 - - i.e., we interleave the contents of the four vectors in their order. - - We use interleave_high/low instructions to create such output. The input of - each interleave_high/low operation is two vectors: - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - the even elements of the result vector are obtained left-to-right from the - high/low elements of the first vector. 
The odd elements of the result are - obtained left-to-right from the high/low elements of the second vector. - The output of interleave_high will be: 0 4 1 5 - and of interleave_low: 2 6 3 7 - - - The permutation is done in log LENGTH stages. In each stage interleave_high - and interleave_low stmts are created for each pair of vectors in DR_CHAIN, - where the first argument is taken from the first half of DR_CHAIN and the - second argument from it's second half. - In our example, - - I1: interleave_high (1st vec, 3rd vec) - I2: interleave_low (1st vec, 3rd vec) - I3: interleave_high (2nd vec, 4th vec) - I4: interleave_low (2nd vec, 4th vec) - - The output for the first stage is: - - I1: 0 16 1 17 2 18 3 19 - I2: 4 20 5 21 6 22 7 23 - I3: 8 24 9 25 10 26 11 27 - I4: 12 28 13 29 14 30 15 31 - - The output of the second stage, i.e. the final result is: - - I1: 0 8 16 24 1 9 17 25 - I2: 2 10 18 26 3 11 19 27 - I3: 4 12 20 28 5 13 21 30 - I4: 6 14 22 30 7 15 23 31. */ - -void -vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect1, vect2, high, low; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - tree perm_mask_low, perm_mask_high; - tree data_ref; - tree perm3_mask_low, perm3_mask_high; - unsigned int i, j, n, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_store_supported ensures that this is constant. 
*/ - unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int j0 = 0, j1 = 0, j2 = 0; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - vect1 = dr_chain[0]; - vect2 = dr_chain[1]; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {j, nelt, *, j + 1, nelt + j + 1, *, - j + 2, nelt + j + 2, *, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - vect1 = data_ref; - vect2 = dr_chain[2]; - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {0, 1, nelt + j, 3, 4, nelt + j + 1, - 6, 7, nelt + j + 2, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. 
*/ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has 2 interleaved stepped patterns. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 2, 3); - sel.quick_grow (6); - for (i = 0; i < 3; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - vec_perm_indices indices (sel, 2, nelt); - perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 6; i++) - sel[i] += exact_div (nelt, 2); - indices.new_vector (sel, 2, nelt); - perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, n = log_length; i < n; i++) - { - for (j = 0; j < length/2; j++) - { - vect1 = dr_chain[j]; - vect2 = dr_chain[j+length/2]; - - /* Create interleaving stmt: - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, - ...}> */ - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); - perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, - vect2, perm_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j] = high; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, - ...}> */ - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); - perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, - vect2, perm_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[2*j+1] = low; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index cb315e6bbf9..cbbe613930c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2883,8 +2883,7 @@ again: unsigned int size = DR_GROUP_SIZE (vinfo); tree vectype = STMT_VINFO_VECTYPE (vinfo); if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST - && ! 
known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) - && ! vect_grouped_store_supported (vectype, size)) + && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)) return opt_result::failure_at (vinfo->stmt, "unsupported grouped store\n"); FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 3b8b98978d3..7a115dbcfcb 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -8413,39 +8413,6 @@ vectorizable_store (vec_info *vinfo, more than one vector stmt - i.e - we need to "unroll" the vector stmt by a factor VF/nunits. */ - /* In case of interleaving (non-unit grouped access): - - S1: &base + 2 = x2 - S2: &base = x0 - S3: &base + 1 = x1 - S4: &base + 3 = x3 - - We create vectorized stores starting from base address (the access of the - first stmt in the chain (S2 in the above example), when the last store stmt - of the chain (S4) is reached: - - VS1: &base = vx2 - VS2: &base + vec_size*1 = vx0 - VS3: &base + vec_size*2 = vx1 - VS4: &base + vec_size*3 = vx3 - - Then permutation statements are generated: - - VS5: vx5 = VEC_PERM_EXPR < vx0, vx3, {0, 8, 1, 9, 2, 10, 3, 11} > - VS6: vx6 = VEC_PERM_EXPR < vx0, vx3, {4, 12, 5, 13, 6, 14, 7, 15} > - ... - - And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts - (the order of the data-refs in the output of vect_permute_store_chain - corresponds to the order of scalar stmts in the interleaving chain - see - the documentation of vect_permute_store_chain()). - - In case of both multiple types and interleaving, above vector stores and - permutation stmts are created for every copy. The result vector stmts are - put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding - STMT_VINFO_RELATED_STMT for the next copies. 
- */ - auto_vec<tree> dr_chain (group_size); auto_vec<tree> vec_masks; tree vec_mask = NULL; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 082e27c04d4..c58b9c2328b 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2567,13 +2567,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *, stmt_vec_info, tree); extern void vect_copy_ref_info (tree, tree); extern tree vect_create_destination_var (tree, tree); -extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT); extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool, vec<int> * = nullptr); -extern void vect_permute_store_chain (vec_info *, vec<tree> &, - unsigned int, stmt_vec_info, - gimple_stmt_iterator *, vec<tree> *); extern tree vect_setup_realignment (vec_info *, stmt_vec_info, gimple_stmt_iterator *, tree *, enum dr_alignment_support, tree, -- 2.43.0