Hi All, When determining issue rates we currently discount non-constant MLA accumulators for Advanced SIMD but don't do it for the latency.
This means the costs for Advanced SIMD with a constant accumulator are wrong and results in us costing SVE and Advanced SIMD the same. This can cauze us to vectorize with Advanced SIMD instead of SVE in some cases. This patch adds the same discount for SVE and Scalar as we do for issue rate. My assumption was that on issue rate we reject all scalar constants early because we take into account the extra instruction to create the constant? Though I'd have expected this to be in prologue costs. For this reason I added an extra parameter to allow me to force the check to at least look for the multiplication. This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large Neoverse cores. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param allow_constants. (aarch64_adjust_stmt_cost): Use it. (aarch64_vector_costs::count_ops): Likewise. (aarch64_vector_costs::add_stmt_cost): Pass vinfo to aarch64_adjust_stmt_cost. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, or multiply-subtract sequence that might be suitable for fusing into a single instruction. If VEC_FLAGS is zero, analyze the operation as a scalar one, otherwise analyze it as an operation on vectors with those - VEC_* flags. */ + VEC_* flags. When ALLOW_CONSTANTS we'll recognize all accumulators including + constant ones. */ static bool aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, - unsigned int vec_flags) + unsigned int vec_flags, bool allow_constants) { gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); if (!assign) @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, if (code != PLUS_EXPR && code != MINUS_EXPR) return false; - if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) - || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + if (!allow_constants + && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))) return false; for (int i = 1; i < 3; ++i) @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) continue; - if (vec_flags & VEC_ADVSIMD) + if (!allow_constants && (vec_flags & VEC_ADVSIMD)) { /* Scalar and SVE code can tie the result to any FMLA input (or none, although that requires a MOVPRFX for SVE). However, Advanced SIMD @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, return false; def_stmt_info = vinfo->lookup_def (rhs); if (!def_stmt_info - || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def) + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def) return false; } @@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, and which when vectorized would operate on vector type VECTYPE. Add the cost of any embedded operations. */ static fractional_cost -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, - tree vectype, fractional_cost stmt_cost) +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, tree vectype, + unsigned vec_flags, fractional_cost stmt_cost) { if (vectype) { @@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, break; } + gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)); + if (assign && !vect_is_reduction (stmt_info)) + { + bool simd_p = vec_flags & VEC_ADVSIMD; + /* For MLA we need to reduce the cost since MLA is 1 instruction. */ + if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p)) + return 0; + } + if (kind == vector_stmt || kind == vec_to_scalar) if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { @@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, } /* Assume that multiply-adds will become a single operation. */ - if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags)) + if (stmt_info + && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false)) return; /* Count the basic operation cost associated with KIND. */ @@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, { /* Account for any extra "embedded" costs that apply additively to the base cost calculated above. */ - stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, - stmt_cost); + stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, + vectype, m_vec_flags, stmt_cost); /* If we're recording a nonzero vector loop body cost for the innermost loop, also estimate the operations that would need --
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, or multiply-subtract sequence that might be suitable for fusing into a single instruction. If VEC_FLAGS is zero, analyze the operation as a scalar one, otherwise analyze it as an operation on vectors with those - VEC_* flags. */ + VEC_* flags. When ALLOW_CONSTANTS we'll recognize all accumulators including + constant ones. */ static bool aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, - unsigned int vec_flags) + unsigned int vec_flags, bool allow_constants) { gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); if (!assign) @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, if (code != PLUS_EXPR && code != MINUS_EXPR) return false; - if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) - || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))) + if (!allow_constants + && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign)) + || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))) return false; for (int i = 1; i < 3; ++i) @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR) continue; - if (vec_flags & VEC_ADVSIMD) + if (!allow_constants && (vec_flags & VEC_ADVSIMD)) { /* Scalar and SVE code can tie the result to any FMLA input (or none, although that requires a MOVPRFX for SVE). However, Advanced SIMD @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, return false; def_stmt_info = vinfo->lookup_def (rhs); if (!def_stmt_info - || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def) + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def) return false; } @@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, and which when vectorized would operate on vector type VECTYPE. Add the cost of any embedded operations. */ static fractional_cost -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, - tree vectype, fractional_cost stmt_cost) +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, tree vectype, + unsigned vec_flags, fractional_cost stmt_cost) { if (vectype) { @@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, break; } + gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info)); + if (assign && !vect_is_reduction (stmt_info)) + { + bool simd_p = vec_flags & VEC_ADVSIMD; + /* For MLA we need to reduce the cost since MLA is 1 instruction. */ + if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p)) + return 0; + } + if (kind == vector_stmt || kind == vec_to_scalar) if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { @@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, } /* Assume that multiply-adds will become a single operation. */ - if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags)) + if (stmt_info + && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false)) return; /* Count the basic operation cost associated with KIND. */ @@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, { /* Account for any extra "embedded" costs that apply additively to the base cost calculated above. */ - stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype, - stmt_cost); + stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, + vectype, m_vec_flags, stmt_cost); /* If we're recording a nonzero vector loop body cost for the innermost loop, also estimate the operations that would need