From: Lili Cui <lili....@intel.com>

Set the length of the chain with FMA to 5 for icelake_cost.

With this patch applied,
SPR multi-copy: 508.namd_r increased by 3%
ICX multi-copy: 508.namd_r increased by 3.5%,
                507.cactuBSSN_r increased by 3.7%

Using FMA instead of mult + add reduces register pressure and the number
of instructions retired.

gcc/ChangeLog:

        * config/i386/i386-options.cc (ix86_option_override_internal):
        Set param_reassoc_max_chain_length_with_fma.
        * config/i386/i386.h (struct processor_costs): Add
        reassoc_max_chain_length_with_fma.
        * config/i386/x86-tune-costs.h (struct processor_costs): Set
        reassoc_max_chain_length_with_fma to 5 for icelake.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/fma-chain.c: New test.
---
 gcc/config/i386/i386-options.cc           |  2 ++
 gcc/config/i386/i386.h                    |  3 ++
 gcc/config/i386/x86-tune-costs.h          | 35 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++++++
 4 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..67d35d89d91 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
                       ix86_tune_cost->l1_cache_size);
   SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
                       ix86_tune_cost->l2_cache_size);
+  SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
+                      ix86_tune_cost->reassoc_max_chain_length_with_fma);
 
   /* 64B is the accepted value for these for all x86.  */
   SET_OPTION_IF_UNSET (&global_options, &global_options_set,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..c7fa7312a67 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -206,6 +206,9 @@ struct processor_costs {
                                   to number of instructions executed in
                                   parallel.  See also
                                   ix86_reassociation_width.  */
+  const int reassoc_max_chain_length_with_fma;
+                               /* Specify max reassociation chain length with
+                                  FMA.  */
   struct stringop_algs *memcpy, *memset;
   const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
                                          cost model.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4f7a67ca5c5..1f57a5ee2a7 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (2),                   /* cost of SQRTSS instruction.  */
   COSTS_N_BYTES (2),                   /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   ix86_size_memcpy,
   ix86_size_memset,
   COSTS_N_BYTES (1),                   /* cond_taken_branch_cost.  */
@@ -238,6 +239,7 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
   COSTS_N_INSNS (122),                 /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122),                 /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   i386_memcpy,
   i386_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -350,6 +352,7 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
   COSTS_N_INSNS (83),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   i486_memcpy,
   i486_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (70),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -563,6 +567,7 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -681,6 +686,7 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (31),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -790,6 +796,7 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (54),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (54),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   geode_memcpy,
   geode_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -902,6 +909,7 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (56),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (56),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   k6_memcpy,
   k6_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -1015,6 +1023,7 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (19),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   athlon_memcpy,
   athlon_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -1137,6 +1146,7 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   k8_memcpy,
   k8_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -1267,6 +1277,7 @@ struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   amdfam10_memcpy,
   amdfam10_memset,
   COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
@@ -1390,6 +1401,7 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
   1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   bdver_memcpy,
   bdver_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -1545,6 +1557,7 @@ struct processor_costs znver1_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   znver1_memcpy,
   znver1_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -1704,6 +1717,7 @@ struct processor_costs znver2_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -1838,6 +1852,7 @@ struct processor_costs znver3_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -1974,6 +1989,7 @@ struct processor_costs znver4_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -2100,6 +2116,7 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   skylake_memcpy,
   skylake_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -2228,6 +2245,12 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Icelake-server prefers FMA chains over breaking dependencies into
+     mult + add, which reduces the number of instructions retired.  A value
+     of 1 means the FMA chain is not kept; a value greater than 1 makes us
+     generate FMA chains.  When the actual FMA chain length exceeds this
+     value, the chain is split according to the reassociation width.  */
+  5,                                   /* Reassoc max FMA chain length.  */
   icelake_memcpy,
   icelake_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -2350,6 +2373,7 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   alderlake_memcpy,
   alderlake_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -2465,6 +2489,7 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (48),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   btver1_memcpy,
   btver1_memset,
   COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
@@ -2577,6 +2602,7 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (16),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   btver2_memcpy,
   btver2_memset,
   COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
@@ -2688,6 +2714,7 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (23),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (38),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   pentium4_memcpy,
   pentium4_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -2802,6 +2829,7 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (41),                  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   nocona_memcpy,
   nocona_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -2914,6 +2942,7 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
   2, 2, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   atom_memcpy,
   atom_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -3026,6 +3055,7 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (20),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),                  /* cost of SQRTSD instruction.  */
   1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   slm_memcpy,
   slm_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -3152,6 +3182,7 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   tremont_memcpy,
   tremont_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -3264,6 +3295,7 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (40),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (40),                  /* cost of SQRTSD instruction.  */
   1, 4, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   intel_memcpy,
   intel_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
@@ -3381,6 +3413,7 @@ struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (60),                  /* cost of SQRTSD instruction.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   lujiazui_memcpy,
   lujiazui_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -3502,6 +3535,7 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   generic_memcpy,
   generic_memset,
   COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
@@ -3630,6 +3664,7 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (30),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (58),                  /* cost of SQRTSD instruction.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  1,                                   /* Reassoc max FMA chain length.  */
   core_memcpy,
   core_memset,
   COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
diff --git a/gcc/testsuite/gcc.target/i386/fma-chain.c b/gcc/testsuite/gcc.target/i386/fma-chain.c
new file mode 100644
index 00000000000..9de61f1b6ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma-chain.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=icelake-server -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add
+   to generate more FMA instructions.  */
+float
+foo (float a, float b, float c, float d, float e, float f, float g, float h, float j)
+{
+   return a * b + c * d + e * f + g * h + j;
+}
+/* { dg-final { scan-assembler-times "vfm" 4 } } */
-- 
2.25.1

Reply via email to