Thanks for the review. Richard Biener <richard.guent...@gmail.com> writes: >> @@ -588,6 +600,23 @@ public: >> /* Unrolling factor */ >> poly_uint64 vectorization_factor; >> >> + /* If this loop is an epilogue loop whose main loop can be skipped, >> + MAIN_LOOP_EDGE is the edge from the main loop to this loop's >> + preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the >> + main loop and goes straight to this loop's preheader. >> + >> + Both fields are null otherwise. */ >> + edge main_loop_edge; >> + edge skip_main_loop_edge; >> + >> + /* If this loop is an epilogue loop that might be skipped after executing >> + the main loop, this edge is the one that skips the epilogue. */ >> + edge skip_this_loop_edge; >> + >> + /* After vectorization, maps live-out SSA names to information about >> + the reductions that generated them. */ >> + hash_map<tree, vect_reusable_accumulator> reusable_accumulators; > > Is that the LC PHI node defs or the definition inside of the loop? > If the latter we could attach the info directly to its stmt-info?
Ah, yeah, I should improve the comment there. It's the vectoriser's replacement for the original LC PHI node, i.e. the final scalar result after the reduction has taken place. >> @@ -1186,6 +1215,21 @@ public: >> /* The vector type for performing the actual reduction. */ >> tree reduc_vectype; >> >> + /* If IS_REDUC_INFO is true and if the reduction is operating on N >> + elements in parallel, this vector gives the initial values of these >> + N elements. */ > > That's N scalar elements or N vector elements? I suppose it's for > SLP reductions (rather than SLP reduction chains) and never non-SLP > reductions? Yeah, poor wording again, sorry. I meant something closer to: /* If IS_REDUC_INFO is true and if the vector code is performing N scalar reductions in parallel, this vector gives the initial scalar values of those N reductions. */ >> + vec<tree> reduc_initial_values; >> + >> + /* If IS_REDUC_INFO is true and if the reduction is operating on N >> + elements in parallel, this vector gives the scalar result of each >> + reduction. */ >> + vec<tree> reduc_scalar_results; Same change here. >> […] >> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c >> index 2909e8a0fc3..b7b0523e3c8 100644 >> --- a/gcc/tree-vect-loop-manip.c >> +++ b/gcc/tree-vect-loop-manip.c >> @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info >> epilogue_vinfo, >> return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true); >> } >> >> +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit >> + from the corresponding main loop. Return a value that is available in >> + LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped. >> + Passing a null SKIP_VALUE is equivalent to passing zero. */ >> + >> +tree >> +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value, >> + tree skip_value) >> +{ >> + if (!loop_vinfo->main_loop_edge) >> + return main_loop_value; >> + >> + if (!skip_value) >> + skip_value = build_zero_cst (TREE_TYPE (main_loop_value)); > > shouldn't that be the initial value? For the current use case, the above two conditions are never true. I wrote it like this because I had a follow-on patch (which might not go anywhere) that needed this function for 0-based IVs. Maybe that's a bad risk/reward trade-off though. Not having to pass zero makes things only slightly simpler for the follow-on patch, and I guess could be dangerous in other cases. Perhaps in that case though I should change loop_vinfo->main_loop_edge into a gcc_assert as well. >> + tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value)); >> + basic_block bb = loop_vinfo->main_loop_edge->dest; >> + gphi *new_phi = create_phi_node (phi_result, bb); >> + add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge, >> + UNKNOWN_LOCATION); >> + add_phi_arg (new_phi, skip_value, >> + loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION); >> + return phi_result; >> +} >> + >> /* Function vect_do_peeling. >> >> Input: >> […] >> @@ -4823,6 +4842,100 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info >> stmt_info) >> return stmt_info; >> } >> >> +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using >> vector >> + type VECTYPE. See if LOOP_VINFO is an epilogue loop whose main loop had >> a >> + matching reduction that we can build on. Adjust REDUC_INFO and return >> true >> + if so, otherwise return false. */ >> + >> +static bool >> +vect_find_reusable_accumulator (loop_vec_info loop_vinfo, >> + stmt_vec_info reduc_info) >> +{ >> + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); >> + if (!main_loop_vinfo) >> + return false; >> + >> + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) >> + return false; >> + >> + unsigned int num_phis = reduc_info->reduc_initial_values.length (); >> + auto_vec<tree, 16> main_loop_results (num_phis); >> + auto_vec<tree, 16> initial_values (num_phis); >> + if (edge main_loop_edge = loop_vinfo->main_loop_edge) >> + { >> + /* The epilogue loop can be entered either from the main loop or >> + from an earlier guard block. */ >> + edge skip_edge = loop_vinfo->skip_main_loop_edge; >> + for (tree incoming_value : reduc_info->reduc_initial_values) >> + { >> + /* Look for: >> + >> + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), >> + INITIAL_VALUE(guard block)>. */ >> + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); >> + >> + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); >> + gcc_assert (gimple_bb (phi) == main_loop_edge->dest); >> + >> + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); >> + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); >> + >> + main_loop_results.quick_push (from_main_loop); >> + initial_values.quick_push (from_skip); >> + } >> + } >> + else >> + /* The main loop dominates the epilogue loop. */ >> + main_loop_results.splice (reduc_info->reduc_initial_values); >> + >> + /* See if the main loop has the kind of accumulator we need. */ >> + vect_reusable_accumulator *accumulator >> + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); >> + if (!accumulator >> + || num_phis != accumulator->reduc_info->reduc_scalar_results.length () >> + || !std::equal (main_loop_results.begin (), main_loop_results.end (), >> + accumulator->reduc_info->reduc_scalar_results.begin >> ())) >> + return false; >> + >> + /* For now, only handle the case in which both loops are operating on the >> + same vector types. In future we could reduce wider vectors to narrower >> + ones as well. */ >> + tree vectype = STMT_VINFO_VECTYPE (reduc_info); >> + tree old_vectype = TREE_TYPE (accumulator->reduc_input); >> + if (!useless_type_conversion_p (old_vectype, vectype)) > > It should be indeed quite trivial to handle, likewise the case where we > have multiple PHIs - just reduce to a single input vector and have the > possibly multiple input vectors in the epilogue filled with neutral > elements. I'll see if I can cook up stuff for this next week. Yeah, agreed. The multi-vector epilogue case should be especially easy to handle, but it's not interesting for SVE as things stand, since: (a) non-SLP reductions use a single cycle for ncopies>1 (a misfeature IMO -- on targets with wide pipelines we want exactly the opposite) (b) SLP reductions are limited to single vectors for variable-length targets. So it wasn't possible to trigger multiple epilogue vectors for the motivating SVE use case. >> […] >> @@ -5196,6 +5305,37 @@ vect_create_epilog_for_reduction (loop_vec_info >> loop_vinfo, >> reduc_inputs.safe_push (single_input); >> } >> >> + tree orig_reduc_input = reduc_inputs[0]; >> + >> + /* If this loop is an epilogue loop that can be skipped after the >> + main loop, we can only share a reduction operation between the >> + main loop and the epilogue if we put it at the target of the >> + skip edge. > > Do you have a testcase where we cannot do this? No, it's being defensive. I wasn't sure how the epilogue code would evolve in future. >> + We can still reuse accumulators if this check fails. Doing so has >> + the minor(?) benefit of making the epilogue loop's scalar result >> + independent of the main loop's scalar result. */ >> + bool unify_with_main_loop_p = false; >> + if (reduc_info->reused_accumulator >> + && loop_vinfo->skip_this_loop_edge >> + && single_succ_p (exit_bb) >> + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) >> + { >> + unify_with_main_loop_p = true; >> + >> + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; >> + reduc_inputs[0] = make_ssa_name (vectype); >> + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); >> + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), >> + UNKNOWN_LOCATION); >> + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, >> + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); >> + exit_gsi = gsi_after_labels (reduc_block); >> + } >> + >> + /* Shouldn't be used beyond this point. */ >> + exit_bb = nullptr; >> + >> if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION >> && reduc_fn != IFN_LAST) >> { >> @@ -5819,6 +5958,12 @@ vect_create_epilog_for_reduction (loop_vec_info >> loop_vinfo, >> scalar_results[0] = new_temp; >> } >> >> + /* Record this operation if it could be reused by the epilogue loop. */ >> + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION >> + && !double_reduc) > > what's the issue with double_reduc? Probably nothing TBH. I haven't been able to construct a case that uses predicated double reductions with vect-partial-vector-usage=1, but that's probably a missed optimisation. There again, double reductions themselves seem to be hard to trigger now that we have loop interchange. Is there a good way of testing them without -fno-loop-interchange? Thanks, Richard >> + loop_vinfo->reusable_accumulators.put (scalar_results[0], >> + { orig_reduc_input, reduc_info }); >> + >> if (double_reduc) >> loop = outer_loop; >>