https://gcc.gnu.org/g:b1c7095a1da11d2543222d98243d10f9cc9823ce
commit r15-3916-gb1c7095a1da11d2543222d98243d10f9cc9823ce
Author: Richard Biener <rguent...@suse.de>
Date:   Mon Sep 23 15:24:01 2024 +0200

    tree-optimization/116818 - try VMAT_GATHER_SCATTER also for SLP
    
    When not doing SLP and we end up with VMAT_ELEMENTWISE we consider
    using strided loads, aka VMAT_GATHER_SCATTER.  The following moves
    this logic down to also apply to SLP where we now can end up using
    VMAT_ELEMENTWISE as well.
    
            PR tree-optimization/116818
            * tree-vect-stmts.cc (get_group_load_store_type): Consider
            VMAT_GATHER_SCATTER instead of VMAT_ELEMENTWISE also for SLP.
            (vectorizable_load): For single-lane VMAT_GATHER_SCATTER also
            ignore permutations.

Diff:
---
 gcc/tree-vect-stmts.cc | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a8031b4f6f5e..0e75e3b49567 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2260,21 +2260,21 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	    }
 	}
     }
+    }
 
-      /* As a last resort, trying using a gather load or scatter store.
+  /* As a last resort, trying using a gather load or scatter store.
 
-	 ??? Although the code can handle all group sizes correctly,
-	 it probably isn't a win to use separate strided accesses based
-	 on nearby locations.  Or, even if it's a win over scalar code,
-	 it might not be a win over vectorizing at a lower VF, if that
-	 allows us to use contiguous accesses.  */
-      if (*memory_access_type == VMAT_ELEMENTWISE
-	  && single_element_p
-	  && loop_vinfo
-	  && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
-						 masked_p, gs_info))
-	*memory_access_type = VMAT_GATHER_SCATTER;
-    }
+     ??? Although the code can handle all group sizes correctly,
+     it probably isn't a win to use separate strided accesses based
+     on nearby locations.  Or, even if it's a win over scalar code,
+     it might not be a win over vectorizing at a lower VF, if that
+     allows us to use contiguous accesses.  */
+  if (*memory_access_type == VMAT_ELEMENTWISE
+      && single_element_p
+      && loop_vinfo
+      && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
+					     masked_p, gs_info))
+    *memory_access_type = VMAT_GATHER_SCATTER;
 
   if (*memory_access_type == VMAT_GATHER_SCATTER
       || *memory_access_type == VMAT_ELEMENTWISE)
@@ -10063,7 +10063,8 @@ vectorizable_load (vec_info *vinfo,
      get_group_load_store_type.  */
   if (slp
       && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-      && !(memory_access_type == VMAT_ELEMENTWISE
+      && !((memory_access_type == VMAT_ELEMENTWISE
+	    || memory_access_type == VMAT_GATHER_SCATTER)
 	   && SLP_TREE_LANES (slp_node) == 1))
     {
       slp_perm = true;
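For context, a minimal sketch of the access pattern this classification applies to; the function below is illustrative, not the testcase from PR 116818.  A load whose stride is loop-invariant but not 1 cannot use a contiguous vector load, so the vectorizer either assembles each vector one element at a time (VMAT_ELEMENTWISE) or, where vect_use_strided_gather_scatters_p holds, emits a strided/gather load (VMAT_GATHER_SCATTER).  This commit lets single-lane SLP take the second path too.

/* Hypothetical example of a single-element strided load.  Under SLP
   such an access used to stay VMAT_ELEMENTWISE (one scalar load per
   lane); with this change get_group_load_store_type can instead pick
   VMAT_GATHER_SCATTER when the target supports strided loads.  */
void
foo (double *restrict dst, const double *restrict src, int n, int stride)
{
  for (int i = 0; i < n; ++i)
    dst[i] = src[i * stride];   /* non-unit, loop-invariant stride */
}

On targets with native strided or gather loads (e.g. RISC-V V's vlse instructions or SVE gathers) the VMAT_GATHER_SCATTER form replaces the per-element scalar loads with a single vector memory operation.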