https://gcc.gnu.org/g:1f6b1ed047105103c2fba9696fd0ed17ec5e1405
commit r16-4076-g1f6b1ed047105103c2fba9696fd0ed17ec5e1405
Author: Richard Biener <rguent...@suse.de>
Date:   Wed Sep 24 12:19:17 2025 +0200

    tree-optimization/116816 - improve VMAT_ELEMENTWISE with SLP

    The following implements VMAT_ELEMENTWISE for grouped loads, in
    particular for being able to serve as fallback for unhandled load
    permutations since it's trivial to load elements in the correct
    order.

        PR tree-optimization/116816
        * tree-vect-stmts.cc (get_load_store_type): Allow multi-lane
        single-element interleaving to fall back to VMAT_ELEMENTWISE.
        Fall back to VMAT_ELEMENTWISE when we cannot handle a load
        permutation.
        (vectorizable_load): Do not check a load permutation for
        VMAT_ELEMENTWISE.  Handle grouped loads with VMAT_ELEMENTWISE
        and directly apply a load permutation.

Diff:
---
 gcc/tree-vect-stmts.cc | 81 +++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 47 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 995151f9a271..cfc4f323a220 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2134,41 +2134,26 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	      || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 	  && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
 	{
-	  if (SLP_TREE_LANES (slp_node) == 1)
-	    {
-	      *memory_access_type = VMAT_ELEMENTWISE;
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "single-element interleaving not supported "
-				 "for not adjacent vector loads, using "
-				 "elementwise access\n");
-	    }
-	  else
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "single-element interleaving not supported "
-				 "for not adjacent vector loads\n");
-	      return false;
-	    }
+	  *memory_access_type = VMAT_ELEMENTWISE;
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "single-element interleaving not supported "
+			     "for not adjacent vector loads, using "
+			     "elementwise access\n");
 	}
-      /* For single-element interleaving also fall back to elementwise
-	 access in case we did not lower a permutation and cannot
-	 code generate it.  */
+      /* Also fall back to elementwise access in case we did not lower a
+	 permutation and cannot code generate it.  */
       if (loop_vinfo
-	  && single_element_p
-	  && SLP_TREE_LANES (slp_node) == 1
-	  && (*memory_access_type == VMAT_CONTIGUOUS
-	      || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+	  && *memory_access_type != VMAT_ELEMENTWISE
 	  && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
 	  && !perm_ok)
 	{
 	  *memory_access_type = VMAT_ELEMENTWISE;
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "single-element interleaving permutation not "
-			     "supported, using elementwise access\n");
+			     "permutation not supported, using elementwise "
+			     "access\n");
 	}

       overrun_p = (loop_vinfo && gap != 0
@@ -2498,9 +2483,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
	 traditional behavior until that can be fixed.  */
       if (*memory_access_type == VMAT_ELEMENTWISE
	  && !STMT_VINFO_STRIDED_P (first_stmt_info)
-	  && !(stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)
-	       && !DR_GROUP_NEXT_ELEMENT (stmt_info)
-	       && !pow2p_hwi (DR_GROUP_SIZE (stmt_info))))
+	  && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
+	       && single_element_p
+	       && !pow2p_hwi (group_size)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9485,11 +9470,11 @@ vectorizable_load (vec_info *vinfo,
   /* ???  The following checks should really be part of
      get_load_store_type.  */
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-      && !((memory_access_type == VMAT_ELEMENTWISE
-	    || mat_gather_scatter_p (memory_access_type))
-	   && SLP_TREE_LANES (slp_node) == 1
-	   && (!grouped_load
-	       || !DR_GROUP_NEXT_ELEMENT (first_stmt_info))))
+      && !(memory_access_type == VMAT_ELEMENTWISE
+	   || (mat_gather_scatter_p (memory_access_type)
+	       && SLP_TREE_LANES (slp_node) == 1
+	       && (!grouped_load
+		   || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
     {
       slp_perm = true;

@@ -9732,28 +9717,24 @@ vectorizable_load (vec_info *vinfo,
	{
	  first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
	  first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
+	  ref_type = get_group_alias_ptr_type (first_stmt_info);
	}
       else
	{
	  first_stmt_info = stmt_info;
	  first_dr_info = dr_info;
+	  ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
	}
-      if (grouped_load && memory_access_type == VMAT_STRIDED_SLP)
+      if (grouped_load)
	{
-	  group_size = DR_GROUP_SIZE (first_stmt_info);
-	  ref_type = get_group_alias_ptr_type (first_stmt_info);
+	  if (memory_access_type == VMAT_STRIDED_SLP)
+	    group_size = DR_GROUP_SIZE (first_stmt_info);
+	  else /* VMAT_ELEMENTWISE */
+	    group_size = SLP_TREE_LANES (slp_node);
	}
       else
-	{
-	  if (grouped_load)
-	    cst_offset
-	      = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
-		 * vect_get_place_in_interleaving_chain (stmt_info,
-							 first_stmt_info));
-	  group_size = 1;
-	  ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
-	}
+	group_size = 1;

       if (!costing_p)
	{
@@ -9892,6 +9873,7 @@ vectorizable_load (vec_info *vinfo,
       int ncopies;
       if (slp_perm)
	{
+	  gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
	  /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
	     variable VF.  */
	  unsigned int const_vf = vf.to_constant ();
@@ -9927,8 +9909,13 @@ vectorizable_load (vec_info *vinfo,
					    slp_node, 0, vect_body);
	      continue;
	    }
+	  unsigned int load_el = group_el;
+	  /* For elementwise accesses apply a load permutation directly.  */
+	  if (memory_access_type == VMAT_ELEMENTWISE
+	      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+	    load_el = SLP_TREE_LOAD_PERMUTATION (slp_node)[group_el];
	  tree this_off = build_int_cst (TREE_TYPE (alias_off),
-					 group_el * elsz + cst_offset);
+					 load_el * elsz + cst_offset);
	  tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
	  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
	  new_temp = make_ssa_name (ltype);
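
For illustration, here is the kind of source loop the fallback targets.  This
is a hypothetical sketch, not the PR 116816 testcase: a two-lane grouped load
whose lanes are consumed in swapped order gives the SLP node a load
permutation, and when that permutation cannot be code generated the
vectorizer can now use VMAT_ELEMENTWISE instead of failing.

  /* Sketch: grouped load in[2*i], in[2*i+1] consumed in permuted
     (swapped) lane order.  */
  void
  foo (double *restrict out, const double *restrict in, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        out[2 * i + 0] = in[2 * i + 1] * 3.;
        out[2 * i + 1] = in[2 * i + 0] * 5.;
      }
  }

Conceptually, the new elementwise code generation folds the permutation into
the per-element load offsets, mirroring the load_el computation in the last
hunk.  The following is a simplified model in plain C, not the actual
GIMPLE-building code; all names (lane, base, perm, stride) are illustrative:

  /* Model of elementwise codegen: lane EL is loaded from the permuted
     position PERM[EL] within the group, so no shuffle is needed after
     the loads.  */
  static void
  load_group_elementwise (double *lane, const double *base,
                          const unsigned *perm, unsigned group_size,
                          unsigned iter, unsigned stride)
  {
    for (unsigned el = 0; el < group_size; ++el)
      {
        unsigned load_el = perm ? perm[el] : el;  /* cf. load_el above */
        lane[el] = base[iter * stride + load_el];
      }
  }

This is also why the patch asserts that slp_perm is never combined with
VMAT_ELEMENTWISE: the permutation has already been applied at load time, so
no separate permute step may run afterwards.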