The patch is trying to unroll the vectorized loop when there are FMA/DOT_PROD_EXPR reductions; it will break cross-iteration dependence and enable more parallelism (since vectorization will also enable partial sums).
When there's gather/scatter or scalarization in the loop, don't do the unroll since the performance bottleneck is not at the reduction. The unroll factor is set as + unsigned unroll_factor + = 1 << ceil_log2 (issue_rate / m_num_reduction); + + /* Default unroll limit 4. */ + m_suggested_unroll_factor + = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor); Note when DOT_PROD_EXPR is not natively supported, m_num_reduction += 3 * count, which almost prevents unrolling. There's a performance boost for simple benchmarks with a DOT_PROD_EXPR/FMA chain, and a slight improvement in SPEC2017 performance. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Any comments? gcc/ChangeLog: * config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs): Add new members m_num_reduction, m_prefer_unroll. (ix86_vector_costs::add_stmt_cost): Set m_prefer_unroll and m_num_reduction. (ix86_vector_costs::finish_cost): Determine m_suggested_unroll_factor with consideration of issue_rate, m_num_reduction and ix86_vect_unroll_limit. * config/i386/i386.opt: Add -param=ix86-vect-unroll-limit. gcc/testsuite/ChangeLog: * gcc.target/i386/vect_unroll-1.c: New test. * gcc.target/i386/vect_unroll-2.c: New test. * gcc.target/i386/vect_unroll-3.c: New test. * gcc.target/i386/vect_unroll-4.c: New test. * gcc.target/i386/vect_unroll-5.c: New test. 
--- gcc/config/i386/i386.cc | 138 +++++++++++++++++- gcc/config/i386/i386.opt | 4 + gcc/testsuite/gcc.target/i386/vect_unroll-1.c | 12 ++ gcc/testsuite/gcc.target/i386/vect_unroll-2.c | 12 ++ gcc/testsuite/gcc.target/i386/vect_unroll-3.c | 12 ++ gcc/testsuite/gcc.target/i386/vect_unroll-4.c | 12 ++ gcc/testsuite/gcc.target/i386/vect_unroll-5.c | 13 ++ 7 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-1.c create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-2.c create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-3.c create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-4.c create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-5.c diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 49bd3939eb4..801f916b161 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -25762,15 +25762,20 @@ private: unsigned m_num_sse_needed[3]; /* Number of 256-bit vector permutation. */ unsigned m_num_avx256_vec_perm[3]; + /* Number of reductions for FMA/DOT_PROD_EXPR. */ + unsigned m_num_reduction; + /* Number of load/store/gather/scatter in loop body. */ + bool m_prefer_unroll; }; ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar) : vector_costs (vinfo, costing_for_scalar), m_num_gpr_needed (), m_num_sse_needed (), - m_num_avx256_vec_perm () -{ -} + m_num_avx256_vec_perm (), + m_num_reduction (), + m_prefer_unroll (true) +{} /* Implement targetm.vectorize.create_costs. */ @@ -26067,6 +26072,118 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, } } + /* Record number of load/store/gather/scatter in vectorized body. */ + if (where == vect_body && !m_costing_for_scalar) + { + switch (kind) + { + /* Emulated gather/scatter or any scalarization. 
*/ + case scalar_load: + case scalar_stmt: + case scalar_store: + case vector_gather_load: + case vector_scatter_store: + m_prefer_unroll = false; + break; + + case vector_stmt: + case vec_to_scalar: + /* Count number of reduction FMA and "real" DOT_PROD_EXPR, + unroll in the vectorizer will enable partial sum. */ + if (stmt_info + && vect_is_reduction (stmt_info) + && stmt_info->stmt) + { + /* Handle __builtin_fma. */ + if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) + { + m_num_reduction += count; + break; + } + + if (gimple_code (stmt_info->stmt) != GIMPLE_ASSIGN) + break; + + tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); + machine_mode inner_mode = GET_MODE_INNER (mode); + tree rhs1, rhs2; + bool native_vnni_p = true; + gimple* def; + machine_mode mode_rhs; + switch (subcode) + { + case PLUS_EXPR: + case MINUS_EXPR: + if (!fp || !flag_associative_math + || flag_fp_contract_mode != FP_CONTRACT_FAST) + break; + + /* FMA condition for different modes. */ + if (((inner_mode == DFmode || inner_mode == SFmode) + && !TARGET_FMA && !TARGET_AVX512VL) + || (inner_mode == HFmode && !TARGET_AVX512FP16) + || (inner_mode == BFmode && !TARGET_AVX10_2)) + break; + + /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed + to FMA/FNMA after vectorization. 
*/ + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + if (subcode == PLUS_EXPR + && TREE_CODE (rhs1) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs1), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduction += count; + else if (TREE_CODE (rhs2) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs2), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduction += count; + break; + + case DOT_PROD_EXPR: + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + mode_rhs = TYPE_MODE (TREE_TYPE (rhs1)); + if (mode_rhs == QImode) + { + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1)); + signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2)); + + /* vpdpbusd. */ + if (signop1_p != signop2_p) + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX512VNNI + : ((TARGET_AVX512VNNI && TARGET_AVX512VL) + || TARGET_AVXVNNI)); + else + /* vpdpbssd. */ + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX10_2 + : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); + } + m_num_reduction += count; + + /* Dislike to do unroll and partial sum for + emulated DOT_PROD_EXPR. */ + if (!native_vnni_p) + m_num_reduction += 3 * count; + break; + + default: + break; + } + } + + default: + break; + } + } + + combined_fn cfn; if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -26282,6 +26399,21 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) m_costs[vect_body] = INT_MAX; + + unsigned issue_rate = ix86_issue_rate (); + if (m_num_reduction && m_num_reduction < issue_rate + /* Not much gain for loop with gather and scatter. 
*/ + && m_prefer_unroll + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + unsigned unroll_factor + = 1 << ceil_log2 (issue_rate / m_num_reduction); + + /* Default unroll limit 4. */ + m_suggested_unroll_factor + = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor); + } + } ix86_vect_estimate_reg_pressure (); diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c93c0b1bb38..6bda22f4843 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1246,6 +1246,10 @@ munroll-only-small-loops Target Var(ix86_unroll_only_small_loops) Init(0) Optimization Enable conservative small loop unrolling. +-param=ix86-vect-unroll-limit= +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param +Limit how much the autovectorizer may unroll a loop. + mlam= Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none) -mlam=[none|u48|u57] Instrument meta data position in user data pointers. diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-1.c b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c new file mode 100644 index 00000000000..2e294d3aea6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -Ofast" } */ +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 4 } } */ + +float +foo (float* a, float* b, int n) +{ + float sum = 0; + for (int i = 0; i != n; i++) + sum += a[i] * b[i]; + return sum; +} diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-2.c b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c new file mode 100644 index 00000000000..069f7d37ae7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -Ofast" } */ +/* { dg-final { scan-assembler-times {(?n)vfnmadd[1-3]*ps[^\n]*ymm} 4 } } */ + +float +foo (float* a, float* b, int n) +{ + float sum = 0; + for (int i = 0; i != n; i++) + sum -= a[i] * b[i]; + return sum; +} diff 
--git a/gcc/testsuite/gcc.target/i386/vect_unroll-3.c b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c new file mode 100644 index 00000000000..6860c2ffbd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-mavxvnni -O3" } */ +/* { dg-final { scan-assembler-times {(?n)vpdpbusd[^\n]*ymm} 4 } } */ + +int +foo (unsigned char* a, char* b, int n) +{ + int sum = 0; + for (int i = 0; i != n; i++) + sum += a[i] * b[i]; + return sum; +} diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-4.c b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c new file mode 100644 index 00000000000..442c2e9529f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O3 -mno-avxvnni" } */ +/* { dg-final { scan-assembler-times {(?n)vpmaddwd[^\n]*ymm} 2 } } */ + +int +foo (unsigned char* a, char* b, int n) +{ + int sum = 0; + for (int i = 0; i != n; i++) + sum += a[i] * b[i]; + return sum; +} diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-5.c b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c new file mode 100644 index 00000000000..c6375b1bc8d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -Ofast -mgather" } */ +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 1 } } */ + +float +foo (float* a, int* b, float* c, int n) +{ + float sum = 0; + for (int i = 0; i != n; i++) + sum += a[b[i]] *c[i]; + return sum; +} + -- 2.34.1