The patch unrolls the vectorized loop when there are FMA/DOT_PROD_EXPR
reductions; the unroll breaks the cross-iteration dependence and
enables more parallelism (since the vectorizer will also accumulate
into partial sums).
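
As a minimal sketch (not part of the patch) of why partial sums help,
consider a plain FMA reduction:

    float
    dot (const float *a, const float *b, int n)
    {
      float sum = 0.0f;
      for (int i = 0; i != n; i++)
        sum += a[i] * b[i];   /* Loop-carried dependence on sum.  */
      return sum;
    }

Here every fma has to wait for the previous one.  With the vectorized
loop unrolled by 2, the vectorizer keeps two independent accumulators
(conceptually sum0 and sum1) and adds them after the loop, so the two
fma chains can issue in parallel.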

When there is a gather/scatter or scalarization in the loop, don't
unroll, since the performance bottleneck is not in the reduction.
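
For instance, in the vect_unroll-5.c test below the gathered load from
a[b[i]] keeps the vectorized loop at a single FMA copy even though it
contains a reduction chain.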

The unroll factor is computed as:

+         unsigned unroll_factor
+           = 1 << ceil_log2 (issue_rate / m_num_reduction);
+
+         /* Default unroll limit 4.  */
+         m_suggested_unroll_factor
+           = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
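
As a worked example (assuming ix86_issue_rate () returns 4 for the
tuned core, which is an assumption and not taken from the patch): a
loop with a single FMA reduction chain (m_num_reduction == 1) gets
unroll_factor = 1 << ceil_log2 (4 / 1) = 4, and the default
ix86_vect_unroll_limit of 4 leaves that unchanged; with two reduction
statements it is 1 << ceil_log2 (4 / 2) = 2.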

Note that when DOT_PROD_EXPR is not natively supported,
m_num_reduction += 3 * count, which effectively prevents unrolling.
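
With the same assumed issue_rate of 4, a single emulated DOT_PROD_EXPR
(count == 1) gives m_num_reduction = 1 + 3 = 4, so the guard
m_num_reduction < issue_rate fails and no unroll factor is suggested.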

There is a performance boost for simple benchmarks with a
DOT_PROD_EXPR/FMA chain, and a slight improvement in SPEC2017
performance.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

        * config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
        Add new members m_num_reduction and m_prefer_unroll.
        (ix86_vector_costs::add_stmt_cost): Set m_prefer_unroll and
        m_num_reduction.
        (ix86_vector_costs::finish_cost): Determine
        m_suggested_unroll_factor with consideration of issue_rate,
        m_num_reduction and ix86_vect_unroll_limit.
        * config/i386/i386.opt: Add -param=ix86-vect-unroll-limit.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/vect_unroll-1.c: New test.
        * gcc.target/i386/vect_unroll-2.c: New test.
        * gcc.target/i386/vect_unroll-3.c: New test.
        * gcc.target/i386/vect_unroll-4.c: New test.
        * gcc.target/i386/vect_unroll-5.c: New test.
---
 gcc/config/i386/i386.cc                       | 138 +++++++++++++++++-
 gcc/config/i386/i386.opt                      |   4 +
 gcc/testsuite/gcc.target/i386/vect_unroll-1.c |  12 ++
 gcc/testsuite/gcc.target/i386/vect_unroll-2.c |  12 ++
 gcc/testsuite/gcc.target/i386/vect_unroll-3.c |  12 ++
 gcc/testsuite/gcc.target/i386/vect_unroll-4.c |  12 ++
 gcc/testsuite/gcc.target/i386/vect_unroll-5.c |  13 ++
 7 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-5.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 49bd3939eb4..801f916b161 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25762,15 +25762,20 @@ private:
   unsigned m_num_sse_needed[3];
   /* Number of 256-bit vector permutation.  */
   unsigned m_num_avx256_vec_perm[3];
+  /* Number of reductions for FMA/DOT_PROD_EXPR.  */
+  unsigned m_num_reduction;
+  /* True if no gather/scatter or scalarization in the loop body.  */
+  bool m_prefer_unroll;
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
     m_num_gpr_needed (),
     m_num_sse_needed (),
-    m_num_avx256_vec_perm ()
-{
-}
+    m_num_avx256_vec_perm (),
+    m_num_reduction (),
+    m_prefer_unroll (true)
+{}
 
 /* Implement targetm.vectorize.create_costs.  */
 
@@ -26067,6 +26072,118 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        }
     }
 
+  /* Check for gather/scatter or scalarization, and count
+     FMA/DOT_PROD_EXPR reductions in the vectorized body.  */
+  if (where == vect_body && !m_costing_for_scalar)
+    {
+      switch (kind)
+       {
+         /* Emulated gather/scatter or any scalarization.  */
+       case scalar_load:
+       case scalar_stmt:
+       case scalar_store:
+       case vector_gather_load:
+       case vector_scatter_store:
+         m_prefer_unroll = false;
+         break;
+
+       case vector_stmt:
+       case vec_to_scalar:
+         /* Count the number of reduction FMAs and "real" DOT_PROD_EXPRs;
+            unrolling in the vectorizer will enable partial sums.  */
+         if (stmt_info
+             && vect_is_reduction (stmt_info)
+             && stmt_info->stmt)
+           {
+             /* Handle __builtin_fma.  */
+             if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
+               {
+                 m_num_reduction += count;
+                 break;
+               }
+
+             if (gimple_code (stmt_info->stmt) != GIMPLE_ASSIGN)
+               break;
+
+             tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+             machine_mode inner_mode = GET_MODE_INNER (mode);
+             tree rhs1, rhs2;
+             bool native_vnni_p = true;
+             gimple* def;
+             machine_mode mode_rhs;
+             switch (subcode)
+               {
+               case PLUS_EXPR:
+               case MINUS_EXPR:
+                 if (!fp || !flag_associative_math
+                     || flag_fp_contract_mode != FP_CONTRACT_FAST)
+                   break;
+
+                 /* FMA condition for different modes.  */
+                 if (((inner_mode == DFmode || inner_mode == SFmode)
+                      && !TARGET_FMA && !TARGET_AVX512VL)
+                     || (inner_mode == HFmode && !TARGET_AVX512FP16)
+                     || (inner_mode == BFmode && !TARGET_AVX10_2))
+                   break;
+
+                 /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
+                    to FMA/FNMA after vectorization.  */
+                 rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+                 rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+                 if (subcode == PLUS_EXPR
+                     && TREE_CODE (rhs1) == SSA_NAME
+                     && (def = SSA_NAME_DEF_STMT (rhs1), true)
+                     && is_gimple_assign (def)
+                     && gimple_assign_rhs_code (def) == MULT_EXPR)
+                   m_num_reduction += count;
+                 else if (TREE_CODE (rhs2) == SSA_NAME
+                          && (def = SSA_NAME_DEF_STMT (rhs2), true)
+                          && is_gimple_assign (def)
+                          && gimple_assign_rhs_code (def) == MULT_EXPR)
+                   m_num_reduction += count;
+                 break;
+
+               case DOT_PROD_EXPR:
+                 rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+                 mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
+                 if (mode_rhs == QImode)
+                   {
+                     rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+                     signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
+                     signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
+
+                     /* vpdpbusd.  */
+                     if (signop1_p != signop2_p)
+                       native_vnni_p
+                         = (GET_MODE_SIZE (mode) == 64
+                            ? TARGET_AVX512VNNI
+                            : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+                               || TARGET_AVXVNNI));
+                     else
+                       /* vpdpbssd.  */
+                       native_vnni_p
+                         = (GET_MODE_SIZE (mode) == 64
+                            ? TARGET_AVX10_2
+                            : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
+                   }
+                 m_num_reduction += count;
+
+                 /* Avoid unrolling and partial sums for
+                    emulated DOT_PROD_EXPR.  */
+                 if (!native_vnni_p)
+                   m_num_reduction += 3 * count;
+                 break;
+
+               default:
+                 break;
+               }
+           }
+
+       default:
+         break;
+       }
+    }
+
   combined_fn cfn;
   if ((kind == vector_stmt || kind == scalar_stmt)
       && stmt_info
@@ -26282,6 +26399,21 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
          && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
              > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
        m_costs[vect_body] = INT_MAX;
+
+      unsigned issue_rate = ix86_issue_rate ();
+      if (m_num_reduction && m_num_reduction < issue_rate
+         /* Not much gain for loops with gather/scatter or scalarization.  */
+         && m_prefer_unroll
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+       {
+         unsigned unroll_factor
+           = 1 << ceil_log2 (issue_rate / m_num_reduction);
+
+         /* Default unroll limit 4.  */
+         m_suggested_unroll_factor
+           = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
+       }
+
     }
 
   ix86_vect_estimate_reg_pressure ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c93c0b1bb38..6bda22f4843 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1246,6 +1246,10 @@ munroll-only-small-loops
 Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
 Enable conservative small loop unrolling.
 
+-param=ix86-vect-unroll-limit=
+Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
+
 mlam=
 Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
 -mlam=[none|u48|u57] Instrument meta data position in user data pointers.
diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-1.c b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
new file mode 100644
index 00000000000..2e294d3aea6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -Ofast" } */
+/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 4 } } */
+
+float
+foo (float* a, float* b, int n)
+{
+  float sum = 0;
+  for (int i = 0; i != n; i++)
+    sum += a[i] * b[i];
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-2.c b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
new file mode 100644
index 00000000000..069f7d37ae7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -Ofast" } */
+/* { dg-final { scan-assembler-times {(?n)vfnmadd[1-3]*ps[^\n]*ymm} 4 } } */
+
+float
+foo (float* a, float* b, int n)
+{
+  float sum = 0;
+  for (int i = 0; i != n; i++)
+    sum -= a[i] * b[i];
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-3.c b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
new file mode 100644
index 00000000000..6860c2ffbd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-mavxvnni -O3" } */
+/* { dg-final { scan-assembler-times {(?n)vpdpbusd[^\n]*ymm} 4 } } */
+
+int
+foo (unsigned char* a, char* b, int n)
+{
+  int sum = 0;
+  for (int i = 0; i != n; i++)
+    sum += a[i] * b[i];
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-4.c b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
new file mode 100644
index 00000000000..442c2e9529f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O3 -mno-avxvnni" } */
+/* { dg-final { scan-assembler-times {(?n)vpmaddwd[^\n]*ymm} 2 } } */
+
+int
+foo (unsigned char* a, char* b, int n)
+{
+  int sum = 0;
+  for (int i = 0; i != n; i++)
+    sum += a[i] * b[i];
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-5.c b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
new file mode 100644
index 00000000000..c6375b1bc8d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -Ofast -mgather" } */
+/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 1 } } */
+
+float
+foo (float* a, int* b, float* c, int n)
+{
+  float sum = 0;
+  for (int i = 0; i != n; i++)
+  sum += a[b[i]] * c[i];
+  return sum;
+}
+
-- 
2.34.1
