The following implements the simple case of single-lane SLP with partial vectors, which can use the existing VEC_EXTRACT_LAST code generation without changes. I'll keep the PR open for further enhancements.
This avoids FAILs of gcc.target/aarch64/sve/live_1.c when using single-lane SLP for non-grouped stores. PR tree-optimization/116609 * tree-vect-loop.cc (vectorizable_live_operation_1): Support partial vectors for single-lane SLP. --- gcc/tree-vect-loop.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 242d5e2d916..31cdc4bf53d 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10961,7 +10961,8 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo, where VEC_LHS is the vectorized live-out result and MASK is the loop mask for the final iteration. */ - gcc_assert (ncopies == 1 && !slp_node); + gcc_assert (ncopies == 1 + && (!slp_node || SLP_TREE_LANES (slp_node) == 1)); gimple_seq tem = NULL; gimple_stmt_iterator gsi = gsi_last (tem); tree len = vect_get_loop_len (loop_vinfo, &gsi, @@ -10995,7 +10996,7 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo, where VEC_LHS is the vectorized live-out result and MASK is the loop mask for the final iteration. */ - gcc_assert (!slp_node); + gcc_assert (!slp_node || SLP_TREE_LANES (slp_node) == 1); tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); gimple_seq tem = NULL; gimple_stmt_iterator gsi = gsi_last (tem); @@ -11147,7 +11148,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, /* No transformation required. 
*/ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) { - if (slp_node) + if (slp_node && SLP_TREE_LANES (slp_node) != 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -11156,7 +11157,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, "the loop.\n"); LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; } - else if (ncopies > 1) + else if (ncopies > 1 + || (slp_node && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -11166,7 +11168,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, } else { - gcc_assert (ncopies == 1 && !slp_node); + gcc_assert (ncopies == 1 + && (!slp_node || SLP_TREE_LANES (slp_node) == 1)); if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, OPTIMIZE_FOR_SPEED)) vect_record_loop_mask (loop_vinfo, @@ -11213,8 +11216,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, if (slp_node) { gcc_assert (!loop_vinfo - || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) - && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); + || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + || SLP_TREE_LANES (slp_node) == 1)); /* Get the correct slp vectorized stmt. */ vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; -- 2.43.0