This patch splits the SVE-specific part of aarch64_adjust_body_cost out into its own subroutine, so that a future patch can call it more than once. I wondered about using a lambda to avoid having to pass all the arguments, but in the end this way seemed clearer.
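To make the trade-off concrete, here is a self-contained sketch of the two shapes that were considered. The names and numbers are invented for illustration; none of this is code from the patch:

/* Option B below is the shape the patch uses: every input and output is
   named in the parameter list.  Option A is the rejected lambda form.  */
#include <cstdio>

static int
adjust_cost_subroutine (int orig_cost, int *cost, bool *disparage)
{
  /* Explicit data flow: the caller can see exactly what is read and
     written without looking at the body.  */
  *cost = orig_cost * 2;
  *disparage = true;
  return *cost;
}

int
main ()
{
  int cost = 10;
  bool disparage = false;

  /* Option A: a lambda captures the locals implicitly, so the call site
     is shorter but the data flow is hidden from the reader.  */
  auto adjust_cost_lambda = [&] () { cost *= 2; disparage = true; };
  adjust_cost_lambda ();

  /* Option B: the same adjustment with the arguments spelled out.  */
  adjust_cost_subroutine (cost, &cost, &disparage);
  printf ("cost=%d disparage=%d\n", cost, (int) disparage);
  return 0;
}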
gcc/
	* config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New
	function, split out from...
	(aarch64_adjust_body_cost): ...here.
---
 gcc/config/aarch64/aarch64.c | 220 ++++++++++++++++++++---------------
 1 file changed, 127 insertions(+), 93 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 17fcb34b2c8..b14b6f22aec 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15488,6 +15488,126 @@ aarch64_estimate_min_cycles_per_iter
   return cycles;
 }
 
+/* Subroutine of aarch64_adjust_body_cost for handling SVE.
+   Use ISSUE_INFO to work out how fast the SVE code can be issued and compare
+   it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER).
+   If COULD_USE_ADVSIMD is true, also compare it to the issue rate of
+   Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER).
+
+   COSTS is as for aarch64_adjust_body_cost.  ORIG_BODY_COST is the cost
+   originally passed to aarch64_adjust_body_cost and *BODY_COST is the current
+   value of the adjusted cost.  *SHOULD_DISPARAGE is true if we think the loop
+   body is too expensive.  */
+
+static fractional_cost
+aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
+			      const aarch64_vec_issue_info *issue_info,
+			      fractional_cost scalar_cycles_per_iter,
+			      fractional_cost advsimd_cycles_per_iter,
+			      bool could_use_advsimd,
+			      unsigned int orig_body_cost,
+			      unsigned int *body_cost,
+			      bool *should_disparage)
+{
+  /* Estimate the minimum number of cycles per iteration needed to issue
+     non-predicate operations.  */
+  fractional_cost sve_nonpred_cycles_per_iter
+    = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
+					    issue_info->sve);
+
+  /* Separately estimate the minimum number of cycles per iteration needed
+     to issue the predicate operations.  */
+  fractional_cost sve_pred_issue_cycles_per_iter
+    = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+
+  /* Calculate the overall limit on the number of cycles per iteration.  */
+  fractional_cost sve_cycles_per_iter
+    = std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter);
+
+  if (dump_enabled_p ())
+    {
+      costs->sve_ops.dump ();
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "  estimated cycles per iteration = %f\n",
+		       sve_cycles_per_iter.as_double ());
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "  estimated cycles per iteration for non-predicate"
+		       " operations = %f\n",
+		       sve_nonpred_cycles_per_iter.as_double ());
+      if (costs->sve_ops.pred_ops)
+	dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
+			 " iteration for predicate operations = %f\n",
+			 sve_pred_issue_cycles_per_iter.as_double ());
+    }
+
+  /* If the scalar version of the loop could issue at least as
+     quickly as the predicate parts of the SVE loop, make the SVE loop
+     prohibitively expensive.  In this case vectorization is adding an
+     overhead that the original scalar code didn't have.
+
+     This is mostly intended to detect cases in which WHILELOs dominate
+     for very tight loops, which is something that normal latency-based
+     costs would not model.  Adding this kind of cliffedge would be
+     too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
+     code in the caller handles that case in a more conservative way.  */
+  fractional_cost sve_estimate = sve_pred_issue_cycles_per_iter + 1;
+  if (scalar_cycles_per_iter < sve_estimate)
+    {
+      unsigned int min_cost
+	= orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
+      if (*body_cost < min_cost)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Increasing body cost to %d because the"
+			     " scalar code could issue within the limit"
+			     " imposed by predicate operations\n",
+			     min_cost);
+	  *body_cost = min_cost;
+	  *should_disparage = true;
+	}
+    }
+
+  /* If it appears that the Advanced SIMD version of a loop could issue
+     more quickly than the SVE one, increase the SVE cost in proportion
+     to the difference.  The intention is to make Advanced SIMD preferable
+     in cases where an Advanced SIMD version exists, without increasing
+     the costs so much that SVE won't be used at all.
+
+     The reasoning is similar to the scalar vs. predicate comparison above:
+     if the issue rate of the SVE code is limited by predicate operations
+     (i.e. if sve_pred_issue_cycles_per_iter > sve_nonpred_cycles_per_iter),
+     and if the Advanced SIMD code could issue within the limit imposed
+     by the predicate operations, the predicate operations are adding an
+     overhead that the original code didn't have and so we should prefer
+     the Advanced SIMD version.  However, if the predicate operations
+     do not dominate in this way, we should only increase the cost of
+     the SVE code if sve_cycles_per_iter is strictly greater than
+     advsimd_cycles_per_iter.  Given rounding effects, this should mean
+     that Advanced SIMD is either better or at least no worse.  */
+  if (sve_nonpred_cycles_per_iter >= sve_pred_issue_cycles_per_iter)
+    sve_estimate = sve_cycles_per_iter;
+  if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
+    {
+      /* This ensures that min_cost > orig_body_cost * 2.  */
+      unsigned int factor = fractional_cost::scale (1, sve_estimate,
+						    advsimd_cycles_per_iter);
+      unsigned int min_cost = orig_body_cost * factor + 1;
+      if (*body_cost < min_cost)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Increasing body cost to %d because Advanced"
+			     " SIMD code could issue as quickly\n",
+			     min_cost);
+	  *body_cost = min_cost;
+	  *should_disparage = true;
+	}
+    }
+
+  return sve_cycles_per_iter;
+}
+
 /* BODY_COST is the cost of a vector loop body recorded in COSTS.
    Adjust the cost as necessary and return the new cost.  */
 static unsigned int
@@ -15583,101 +15703,15 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 
   if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
     {
-      /* Estimate the minimum number of cycles per iteration needed to issue
-	 non-predicate operations.  */
-      fractional_cost sve_cycles_per_iter
-	= aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
-						issue_info->sve);
-
-      /* Separately estimate the minimum number of cycles per iteration needed
-	 to issue the predicate operations.  */
-      fractional_cost pred_cycles_per_iter
-	= { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
-      if (dump_enabled_p ())
-	{
-	  dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
-	  costs->sve_ops.dump ();
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "  estimated cycles per iteration for non-predicate"
-			   " operations = %f\n",
-			   sve_cycles_per_iter.as_double ());
-	  if (costs->sve_ops.pred_ops)
-	    dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
-			     " iteration for predicate operations = %d\n",
-			     pred_cycles_per_iter.as_double ());
-	}
-
-      vector_cycles_per_iter = std::max (sve_cycles_per_iter,
-					 pred_cycles_per_iter);
+      dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
       vector_reduction_latency = costs->sve_ops.reduction_latency;
-
-      /* If the scalar version of the loop could issue at least as
-	 quickly as the predicate parts of the SVE loop, make the SVE loop
-	 prohibitively expensive.  In this case vectorization is adding an
-	 overhead that the original scalar code didn't have.
-
-	 This is mostly intended to detect cases in which WHILELOs dominate
-	 for very tight loops, which is something that normal latency-based
-	 costs would not model.  Adding this kind of cliffedge would be
-	 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
-	 code later in the function handles that case in a more
-	 conservative way.  */
-      fractional_cost sve_estimate = pred_cycles_per_iter + 1;
-      if (scalar_cycles_per_iter < sve_estimate)
-	{
-	  unsigned int min_cost
-	    = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
-	  if (body_cost < min_cost)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-				 "Increasing body cost to %d because the"
-				 " scalar code could issue within the limit"
-				 " imposed by predicate operations\n",
-				 min_cost);
-	      body_cost = min_cost;
-	      should_disparage = true;
-	    }
-	}
-
-      /* If it appears that the Advanced SIMD version of a loop could issue
-	 more quickly than the SVE one, increase the SVE cost in proportion
-	 to the difference.  The intention is to make Advanced SIMD preferable
-	 in cases where an Advanced SIMD version exists, without increasing
-	 the costs so much that SVE won't be used at all.
-
-	 The reasoning is similar to the scalar vs. predicate comparison above:
-	 if the issue rate of the SVE code is limited by predicate operations
-	 (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the
-	 Advanced SIMD code could issue within the limit imposed by the
-	 predicate operations, the predicate operations are adding an
-	 overhead that the original code didn't have and so we should prefer
-	 the Advanced SIMD version.  However, if the predicate operations
-	 do not dominate in this way, we should only increase the cost of
-	 the SVE code if sve_cycles_per_iter is strictly greater than
-	 advsimd_cycles_per_iter.  Given rounding effects, this should mean
-	 that Advanced SIMD is either better or at least no worse.  */
-      if (sve_cycles_per_iter >= pred_cycles_per_iter)
-	sve_estimate = sve_cycles_per_iter;
-      if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
-	{
-	  /* This ensures that min_cost > orig_body_cost * 2.  */
-	  unsigned int factor
-	    = fractional_cost::scale (1, sve_estimate,
-				      advsimd_cycles_per_iter);
-	  unsigned int min_cost = orig_body_cost * factor + 1;
-	  if (body_cost < min_cost)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-				 "Increasing body cost to %d because Advanced"
-				 " SIMD code could issue as quickly\n",
-				 min_cost);
-	      body_cost = min_cost;
-	      should_disparage = true;
-	    }
-	}
+      vector_cycles_per_iter
+	= aarch64_adjust_body_cost_sve (costs, issue_info,
+					scalar_cycles_per_iter,
+					advsimd_cycles_per_iter,
+					could_use_advsimd, orig_body_cost,
+					&body_cost, &should_disparage);
     }
 
   /* Decide whether to stick to latency-based costs or whether to try to
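As a footnote, here is a rough standalone illustration of the Advanced SIMD vs. SVE scaling heuristic above. The fraction type and scale_ceil are stand-ins for fractional_cost and fractional_cost::scale, and the rounding choice and cycle counts are invented for this sketch, so it shows the shape of the calculation rather than GCC's exact semantics:

#include <cstdio>

/* Stand-in for fractional_cost: the value is num / den.  */
struct fraction
{
  unsigned int num, den;
  double as_double () const { return (double) num / den; }
};

/* Stand-in for fractional_cost::scale (AMOUNT, B, C): AMOUNT * B / C,
   rounded up here so that the factor is at least 2 whenever B > C.  */
static unsigned int
scale_ceil (unsigned int amount, fraction b, fraction c)
{
  unsigned long long n = (unsigned long long) amount * b.num * c.den;
  unsigned long long d = (unsigned long long) b.den * c.num;
  return (unsigned int) ((n + d - 1) / d);
}

int
main ()
{
  fraction sve_estimate = { 9, 2 };  /* SVE needs ~4.5 cycles/iteration.  */
  fraction advsimd = { 3, 1 };       /* Advanced SIMD needs 3.  */
  unsigned int orig_body_cost = 20;
  unsigned int body_cost = orig_body_cost;

  /* Advanced SIMD issues faster, so bump the SVE cost in proportion:
     factor = ceil (4.5 / 3) = 2 and min_cost = 20 * 2 + 1 = 41, which is
     strictly greater than orig_body_cost * 2, as the patch comment says.  */
  if (advsimd.as_double () < sve_estimate.as_double ())
    {
      unsigned int factor = scale_ceil (1, sve_estimate, advsimd);
      unsigned int min_cost = orig_body_cost * factor + 1;
      if (body_cost < min_cost)
	body_cost = min_cost;
    }
  printf ("body cost adjusted from %u to %u\n", orig_body_cost, body_cost);
  return 0;
}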