https://gcc.gnu.org/g:32b1be7eb434addefe203249746dc6bbdc185403
commit r16-2785-g32b1be7eb434addefe203249746dc6bbdc185403
Author: Richard Biener <rguent...@suse.de>
Date:   Mon Aug 4 14:45:53 2025 +0200

    tree-optimization/121395 - SLP of SIMD calls w/o LHS

    The following records the alternate SLP instance entries coming from
    stmts with stores that have no SSA def, like OMP SIMD calls without LHS.
    There's a bit of fallout with having a SLP tree with a NULL vectype,
    but nothing too gross.

            PR tree-optimization/121395
            * tree-vectorizer.h (_loop_vec_info::alternate_defs): New member.
            (LOOP_VINFO_ALTERNATE_DEFS): New.
            * tree-vect-stmts.cc (vect_stmt_relevant_p): Populate it.
            (vectorizable_simd_clone_call): Do not register a SLP def when
            there is none.
            * tree-vect-slp.cc (vect_build_slp_tree_1): Allow a NULL vectype
            when there's no LHS.  Allow all calls w/o LHS.
            (vect_analyze_slp): Process LOOP_VINFO_ALTERNATE_DEFS as SLP
            graph entries.
            (vect_make_slp_decision): Handle a NULL SLP_TREE_VECTYPE.
            (vect_slp_analyze_node_operations_1): Likewise.
            (vect_schedule_slp_node): Likewise.
            * gcc.dg/vect/pr59984.c: Adjust.

Diff: --- gcc/testsuite/gcc.dg/vect/pr59984.c | 4 ++++ gcc/tree-vect-slp.cc | 37 ++++++++++++++++++++++++++++--------- gcc/tree-vect-stmts.cc | 6 +++++- gcc/tree-vectorizer.h | 5 +++++ 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr59984.c b/gcc/testsuite/gcc.dg/vect/pr59984.c index c00c22671586..8ca446ea67c6 100644 --- a/gcc/testsuite/gcc.dg/vect/pr59984.c +++ b/gcc/testsuite/gcc.dg/vect/pr59984.c @@ -64,3 +64,7 @@ main () return 0; } +/* { dg-final { scan-tree-dump "31:17: optimized: loop vectorized" "vect" } } */ +/* { dg-final { scan-tree-dump "37:7: optimized: loop vectorized" "vect" } } */ +/* { dg-final { scan-tree-dump "44:17: optimized: loop vectorized" "vect" } } */ +/* { dg-final { scan-tree-dump "50:7: optimized: loop vectorized" "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index b46989505e04..2530216e9f18 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1140,7 +1140,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, soft_fail_nunits_vectype = nunits_vectype; } - gcc_assert (vectype); + gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt)); *node_vectype = vectype; /* For every stmt in NODE find its def stmt/s. 
*/ @@ -1187,10 +1187,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, gcall *call_stmt = dyn_cast <gcall *> (stmt); tree lhs = gimple_get_lhs (stmt); - if (lhs == NULL_TREE - && (!call_stmt - || !gimple_call_internal_p (stmt) - || !internal_store_fn_p (gimple_call_internal_fn (stmt)))) + if (lhs == NULL_TREE && !call_stmt) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -4917,6 +4914,22 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size, return opt_result::failure_at (vect_location, "SLP build failed.\n"); } + + stmt_vec_info stmt_info; + FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info) + { + vec<stmt_vec_info> stmts; + vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; + stmts.create (1); + stmts.quick_push (stmt_info); + if (! vect_build_slp_instance (vinfo, slp_inst_kind_store, + stmts, roots, remain, max_tree_size, + &limit, bst_map, NULL, + force_single_lane)) + return opt_result::failure_at (vect_location, + "SLP build failed.\n"); + } } if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo)) @@ -7634,7 +7647,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo) /* If all instances ended up with vector(1) T roots make sure to not vectorize. RVV for example relies on loop vectorization when some instances are essentially kept scalar. See PR121048. */ - if (known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U)) + if (SLP_TREE_VECTYPE (root) + && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U)) decided_to_slp++; } @@ -7961,7 +7975,10 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, elements in a vector. For single-defuse-cycle, lane-reducing op, and PHI statement that starts reduction comprised of only lane-reducing ops, the number is more than effective vector statements actually required. 
*/ - SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node); + if (SLP_TREE_VECTYPE (node)) + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node); + else + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; /* Handle purely internal nodes. */ if (SLP_TREE_CODE (node) == VEC_PERM_EXPR) @@ -11318,8 +11335,10 @@ vect_schedule_slp_node (vec_info *vinfo, stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node); - gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); - SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); + gcc_assert (!SLP_TREE_VECTYPE (node) + || SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); + if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0) + SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); if (SLP_TREE_CODE (node) != VEC_PERM_EXPR && STMT_VINFO_DATA_REF (stmt_info)) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 6a6f2ccbbcef..f7a052b6660e 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -386,6 +386,9 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, dump_printf_loc (MSG_NOTE, vect_location, "vec_stmt_relevant_p: stmt has vdefs.\n"); *relevant = vect_used_in_scope; + if (! STMT_VINFO_DATA_REF (stmt_info) + && zero_ssa_operands (stmt_info->stmt, SSA_OP_DEF)) + LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo).safe_push (stmt_info); } /* uses outside the loop. */ @@ -4752,7 +4755,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, } } - SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt)); + if (gimple_get_lhs (new_stmt)) + SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt)); } for (i = 0; i < nargs; ++i) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index c8482c2b4a67..5b0eed3c46d0 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -947,6 +947,10 @@ public: stmt in the chain. 
*/ auto_vec<stmt_vec_info> reduction_chains; + /* Defs that could not be analyzed such as OMP SIMD calls without + a LHS. */ + auto_vec<stmt_vec_info> alternate_defs; + /* Cost vector for a single scalar iteration. */ auto_vec<stmt_info_for_cost> scalar_cost_vec; @@ -1186,6 +1190,7 @@ public: #define LOOP_VINFO_INNER_LOOP_COST_FACTOR(L) (L)->inner_loop_cost_factor #define LOOP_VINFO_INV_PATTERN_DEF_SEQ(L) (L)->inv_pattern_def_seq #define LOOP_VINFO_DRS_ADVANCED_BY(L) (L)->drs_advanced_by +#define LOOP_VINFO_ALTERNATE_DEFS(L) (L)->alternate_defs #define LOOP_VINFO_FULLY_MASKED_P(L) \ (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \