Hi All,

When determining issue rates we currently discount non-constant MLA accumulators
for Advanced SIMD but don't do it for the latency.

This means the costs for Advanced SIMD with a constant accumulator are wrong and
results in us costing SVE and Advanced SIMD the same.  This can cauze us to
vectorize with Advanced SIMD instead of SVE in some cases.

This patch adds the same discount for SVE and Scalar as we do for issue rate.

My assumption was that on issue rate we reject all scalar constants early
because we take into account the extra instruction to create the constant?
Though I'd have expected this to be in prologue costs.  For this reason I added
an extra parameter to allow me to force the check to at least look for the
multiplication.

This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large
Neoverse cores.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
        allow_constants. 
        (aarch64_adjust_stmt_cost): Use it.
        (aarch64_vector_costs::count_ops): Likewise.
        (aarch64_vector_costs::add_stmt_cost): Pass vinfo to
        aarch64_adjust_stmt_cost.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt 
kind,
    or multiply-subtract sequence that might be suitable for fusing into a
    single instruction.  If VEC_FLAGS is zero, analyze the operation as
    a scalar one, otherwise analyze it as an operation on vectors with those
-   VEC_* flags.  */
+   VEC_* flags.  When ALLOW_CONSTANTS we'll recognize all accumulators 
including
+   constant ones.  */
 static bool
 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
-                       unsigned int vec_flags)
+                       unsigned int vec_flags, bool allow_constants)
 {
   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
   if (!assign)
@@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
+  if (!allow_constants
+      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
+         || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
     return false;
 
   for (int i = 1; i < 3; ++i)
@@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
        continue;
 
-      if (vec_flags & VEC_ADVSIMD)
+      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
        {
          /* Scalar and SVE code can tie the result to any FMLA input (or none,
             although that requires a MOVPRFX for SVE).  However, Advanced SIMD
@@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
            return false;
          def_stmt_info = vinfo->lookup_def (rhs);
          if (!def_stmt_info
-             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
            return false;
        }
 
@@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, 
vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-                         tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+                         stmt_vec_info stmt_info, tree vectype,
+                         unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, 
stmt_vec_info stmt_info,
          break;
        }
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+       {
+         bool simd_p = vec_flags & VEC_ADVSIMD;
+         /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+         if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+           return 0;
+       }
+
       if (kind == vector_stmt || kind == vec_to_scalar)
        if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
          {
@@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, 
vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
         to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-                                           stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+                                           vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
         innermost loop, also estimate the operations that would need




-- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt 
kind,
    or multiply-subtract sequence that might be suitable for fusing into a
    single instruction.  If VEC_FLAGS is zero, analyze the operation as
    a scalar one, otherwise analyze it as an operation on vectors with those
-   VEC_* flags.  */
+   VEC_* flags.  When ALLOW_CONSTANTS we'll recognize all accumulators 
including
+   constant ones.  */
 static bool
 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
-                       unsigned int vec_flags)
+                       unsigned int vec_flags, bool allow_constants)
 {
   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
   if (!assign)
@@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
+  if (!allow_constants
+      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
+         || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
     return false;
 
   for (int i = 1; i < 3; ++i)
@@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
        continue;
 
-      if (vec_flags & VEC_ADVSIMD)
+      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
        {
          /* Scalar and SVE code can tie the result to any FMLA input (or none,
             although that requires a MOVPRFX for SVE).  However, Advanced SIMD
@@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info,
            return false;
          def_stmt_info = vinfo->lookup_def (rhs);
          if (!def_stmt_info
-             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
            return false;
        }
 
@@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, 
vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-                         tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+                         stmt_vec_info stmt_info, tree vectype,
+                         unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, 
stmt_vec_info stmt_info,
          break;
        }
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+       {
+         bool simd_p = vec_flags & VEC_ADVSIMD;
+         /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+         if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+           return 0;
+       }
+
       if (kind == vector_stmt || kind == vec_to_scalar)
        if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
          {
@@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, 
vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
         to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-                                           stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+                                           vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
         innermost loop, also estimate the operations that would need



Reply via email to