The following extends SLP discovery to handle non-grouped loads in loop vectorization in the case where the same load appears in all lanes.
Code generation is adjusted to mimic what we do for the case of single element interleaving (when the load is not unit-stride), which is already handled by SLP. There are some limits we run into because peeling for gaps cannot cover all cases and we choose VMAT_CONTIGUOUS. The patch does not try to address these issues yet. The main obstacle is that these loads are not STMT_VINFO_GROUPED_ACCESS and that's a new thing with SLP. I know from the past that it's not a good idea to make them grouped. Instead the following massages places to deal with SLP loads that are not STMT_VINFO_GROUPED_ACCESS. There's already a testcase for the case the PR is after, just XFAILed; the following adjusts that instead of adding another. I do expect to have missed some places, so I don't plan to push this on a Friday. Still there may be feedback, so posting this now. Bootstrapped and tested on x86_64-unknown-linux-gnu. PR tree-optimization/96208 * tree-vect-slp.cc (vect_build_slp_tree_1): Allow a non-grouped load if it is the same for all lanes. (vect_build_slp_tree_2): Handle not grouped loads. (vect_optimize_slp_pass::remove_redundant_permutations): Likewise. (vect_transform_slp_perm_load_1): Likewise. * tree-vect-stmts.cc (vect_model_load_cost): Likewise. (get_group_load_store_type): Likewise. Handle invariant accesses. (vectorizable_load): Likewise. * gcc.dg/vect/slp-46.c: Adjust for new vectorizations. * gcc.dg/vect/bb-slp-pr65935.c: Adjust.
--- gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c | 16 ++- gcc/testsuite/gcc.dg/vect/slp-46.c | 2 +- gcc/tree-vect-slp.cc | 51 +++++--- gcc/tree-vect-stmts.cc | 128 +++++++++++++-------- 4 files changed, 127 insertions(+), 70 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c index ee121364910..8cefa7f52af 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c @@ -24,11 +24,17 @@ void rephase (void) struct site *s; for(i=0,s=lattice;i<sites_on_node;i++,s++) for(dir=0;dir<32;dir++) - for(j=0;j<3;j++)for(k=0;k<3;k++) - { - s->link[dir].e[j][k].real *= s->phase[dir]; - s->link[dir].e[j][k].imag *= s->phase[dir]; - } + { + for(j=0;j<3;j++) + for(k=0;k<3;k++) + { + s->link[dir].e[j][k].real *= s->phase[dir]; + s->link[dir].e[j][k].imag *= s->phase[dir]; + } + /* Avoid loop vectorizing the outer loop after unrolling + the inners. */ + __asm__ volatile ("" : : : "memory"); + } } int main() diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c index 18476a43d3f..79ed0bb9f6b 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-46.c +++ b/gcc/testsuite/gcc.dg/vect/slp-46.c @@ -94,4 +94,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index ab89a82f1b3..4481d43e3d7 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1286,15 +1286,19 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, { if (load_p && rhs_code != CFN_GATHER_LOAD - && rhs_code != CFN_MASK_GATHER_LOAD) + && rhs_code != CFN_MASK_GATHER_LOAD + /* Not grouped loads are handled as externals for BB + vectorization. For loop vectorization we can handle + splats the same we handle single element interleaving. 
*/ + && (is_a <bb_vec_info> (vinfo) + || stmt_info != first_stmt_info)) { /* Not grouped load. */ if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Build SLP failed: not grouped load %G", stmt); - /* FORNOW: Not grouped loads are not supported. */ - if (is_a <bb_vec_info> (vinfo) && i != 0) + if (i != 0) continue; /* Fatal mismatch. */ matches[0] = false; @@ -1302,7 +1306,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } /* Not memory operation. */ - if (!phi_p + if (!load_p + && !phi_p && rhs_code.is_tree_code () && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary @@ -1774,7 +1779,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, return NULL; /* If the SLP node is a load, terminate the recursion unless masked. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info) + if (STMT_VINFO_DATA_REF (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) { if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) @@ -1798,8 +1803,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) { - int load_place = vect_get_place_in_interleaving_chain - (load_info, first_stmt_info); + int load_place; + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + load_place = vect_get_place_in_interleaving_chain + (load_info, first_stmt_info); + else + load_place = 0; gcc_assert (load_place != -1); load_permutation.safe_push (load_place); } @@ -5425,6 +5434,16 @@ vect_optimize_slp_pass::remove_redundant_permutations () this_load_permuted = true; break; } + /* When this isn't a grouped access we know it's single element + and contiguous. 
*/ + if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0])) + { + if (!this_load_permuted + && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U) + || SLP_TREE_LANES (node) == 1)) + SLP_TREE_LOAD_PERMUTATION (node).release (); + continue; + } stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); if (!this_load_permuted @@ -8115,12 +8134,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, tree vectype = SLP_TREE_VECTYPE (node); unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); unsigned int mask_element; + unsigned dr_group_size; machine_mode mode; if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) - return false; - - stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + dr_group_size = 1; + else + { + stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + dr_group_size = DR_GROUP_SIZE (stmt_info); + } mode = TYPE_MODE (vectype); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); @@ -8161,7 +8184,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, unsigned int nelts_to_build; unsigned int nvectors_per_build; unsigned int in_nlanes; - bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info) + bool repeating_p = (group_size == dr_group_size && multiple_p (nunits, group_size)); if (repeating_p) { @@ -8174,7 +8197,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, it at least one to ensure the later computation for n_perms proceed. */ nvectors_per_build = nstmts > 0 ? 
nstmts : 1; - in_nlanes = DR_GROUP_SIZE (stmt_info) * 3; + in_nlanes = dr_group_size * 3; } else { @@ -8186,7 +8209,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, mask.new_vector (const_nunits, const_nunits, 1); nelts_to_build = const_vf * group_size; nvectors_per_build = 1; - in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info); + in_nlanes = const_vf * dr_group_size; } auto_sbitmap used_in_lanes (in_nlanes); bitmap_clear (used_in_lanes); @@ -8200,7 +8223,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node, { unsigned int iter_num = j / group_size; unsigned int stmt_num = j % group_size; - unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]); + unsigned int i = (iter_num * dr_group_size + perm[stmt_num]); bitmap_set_bit (used_in_lanes, i); if (repeating_p) { diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index e6649789540..b1b08238dc3 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -1150,6 +1150,8 @@ vect_model_load_cost (vec_info *vinfo, /* If the load is permuted then the alignment is determined by the first group element not by the first scalar stmt DR. */ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + if (!first_stmt_info) + first_stmt_info = stmt_info; /* Record the cost for the permutation. */ unsigned n_perms, n_loads; vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, @@ -2204,12 +2206,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, { loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); class loop *loop = loop_vinfo ? 
LOOP_VINFO_LOOP (loop_vinfo) : NULL; - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + stmt_vec_info first_stmt_info; + unsigned int group_size; + unsigned HOST_WIDE_INT gap; + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + group_size = DR_GROUP_SIZE (first_stmt_info); + gap = DR_GROUP_GAP (first_stmt_info); + } + else + { + first_stmt_info = stmt_info; + group_size = 1; + gap = 0; + } dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); - unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); bool single_element_p = (stmt_info == first_stmt_info && !DR_GROUP_NEXT_ELEMENT (stmt_info)); - unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); /* True if the vectorized statements would access beyond the last @@ -2312,11 +2326,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, *memory_access_type = VMAT_ELEMENTWISE; } } - else + else if (cmp == 0 && loop_vinfo) { - gcc_assert (!loop_vinfo || cmp > 0); - *memory_access_type = VMAT_CONTIGUOUS; + gcc_assert (vls_type == VLS_LOAD); + *memory_access_type = VMAT_INVARIANT; + /* Invariant accesses perform only component accesses, alignment + is irrelevant for them. */ + *alignment_support_scheme = dr_unaligned_supported; } + else + *memory_access_type = VMAT_CONTIGUOUS; /* When we have a contiguous access across loop iterations but the access in the loop doesn't cover the full vector @@ -2541,7 +2560,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, is irrelevant for them. 
*/ *alignment_support_scheme = dr_unaligned_supported; } - else if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node) { if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node, masked_p, @@ -9408,46 +9427,6 @@ vectorizable_load (vec_info *vinfo, return false; } - if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) - { - slp_perm = true; - - if (!loop_vinfo) - { - /* In BB vectorization we may not actually use a loaded vector - accessing elements in excess of DR_GROUP_SIZE. */ - stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; - group_info = DR_GROUP_FIRST_ELEMENT (group_info); - unsigned HOST_WIDE_INT nunits; - unsigned j, k, maxk = 0; - FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k) - if (k > maxk) - maxk = k; - tree vectype = SLP_TREE_VECTYPE (slp_node); - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) - || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "BB vectorization with gaps at the end of " - "a load is not supported\n"); - return false; - } - } - - auto_vec<tree> tem; - unsigned n_perms; - if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf, - true, &n_perms)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "unsupported load permutation\n"); - return false; - } - } - /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. */ if (!PURE_SLP_STMT (stmt_info) @@ -9465,6 +9444,46 @@ vectorizable_load (vec_info *vinfo, else group_size = 1; + if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) + { + slp_perm = true; + + if (!loop_vinfo) + { + /* In BB vectorization we may not actually use a loaded vector + accessing elements in excess of DR_GROUP_SIZE. 
*/ + stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; + group_info = DR_GROUP_FIRST_ELEMENT (group_info); + unsigned HOST_WIDE_INT nunits; + unsigned j, k, maxk = 0; + FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k) + if (k > maxk) + maxk = k; + tree vectype = SLP_TREE_VECTYPE (slp_node); + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) + || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "BB vectorization with gaps at the end of " + "a load is not supported\n"); + return false; + } + } + + auto_vec<tree> tem; + unsigned n_perms; + if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf, + true, &n_perms)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "unsupported load permutation\n"); + return false; + } + } + vect_memory_access_type memory_access_type; enum dr_alignment_support alignment_support_scheme; int misalignment; @@ -9842,10 +9861,19 @@ vectorizable_load (vec_info *vinfo, || (!slp && memory_access_type == VMAT_CONTIGUOUS)) grouped_load = false; - if (grouped_load) + if (grouped_load + || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())) { - first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - group_size = DR_GROUP_SIZE (first_stmt_info); + if (grouped_load) + { + first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + group_size = DR_GROUP_SIZE (first_stmt_info); + } + else + { + first_stmt_info = stmt_info; + group_size = 1; + } /* For SLP vectorization we directly vectorize a subchain without permutation. */ if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) -- 2.35.3