memset inline strategies for -mtune=tremont

lili.cui--- via Gcc-patches Wed, 15 Sep 2021 01:13:07 -0700

From: "H.J. Lu" <hjl.to...@gmail.com>

Simply memcpy and memset inline strategies to avoid branches for
-mtune=tremont:


1. Create Tremont cost model from generic cost model.
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
3. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
4. Use memcpy/memset libray function if data size is unknown or > 256.

        * config/i386/i386-options.c (processor_cost_table): Use
        tremont_cost for Tremont.
        * config/i386/x86-tune-costs.h (tremont_memcpy): New.
        (tremont_memset): Likewise.
        (tremont_cost): Likewise.
        * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
        Enable for Tremont.
---
 gcc/config/i386/i386-options.c   |   2 +-
 gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
 gcc/config/i386/x86-tune.def     |   2 +-
 3 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index c0006b3674b..e7a3bd4aaea 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] 
=
   &slm_cost,
   &slm_cost,
   &slm_cost,
-  &slm_cost,
+  &tremont_cost,
   &slm_cost,
   &slm_cost,
   &skylake_cost,
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..93644be9cb3 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
   "16",                                        /* Func alignment.  */
 };
 
+static stringop_algs tremont_memcpy[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
+static stringop_algs tremont_memset[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
+static const
+struct processor_costs tremont_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,                                /* cost for loading QImode using movzbl */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  4,                                   /* cost of reg,reg fld/fst */
+  {6, 6, 12},                          /* cost of loading fp registers
+                                          in SFmode, DFmode and XFmode */
+  {6, 6, 12},                          /* cost of storing fp registers
+                                          in SFmode, DFmode and XFmode */
+  2,                                   /* cost of moving MMX register */
+  {6, 6},                              /* cost of loading MMX registers
+                                          in SImode and DImode */
+  {6, 6},                              /* cost of storing MMX registers
+                                          in SImode and DImode */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register 
*/
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
+                                          in 32,64,128,256 and 512-bit */
+  6, 6,                                /* SSE->integer and integer->SSE moves 
*/
+  6, 6,                                /* mask->integer and integer->mask 
moves */
+  {6, 6, 6},                           /* cost of loading mask register
+                                          in QImode, HImode, SImode.  */
+  {6, 6, 6},                   /* cost if storing mask register
+                                          in QImode, HImode, SImode.  */
+  2,                                   /* cost of moving mask register.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                   /* cost of an add instruction */
+  /* Setting cost to 2 makes our current implementation of synth_mult result in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
+  COSTS_N_INSNS (1),                   /* variable shift costs */
+  COSTS_N_INSNS (1),                   /* constant shift costs */
+  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),                  /*                               HI */
+   COSTS_N_INSNS (3),                  /*                               SI */
+   COSTS_N_INSNS (4),                  /*                               DI */
+   COSTS_N_INSNS (4)},                 /*                            other */
+  0,                                   /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (22),                 /*                          HI */
+   COSTS_N_INSNS (30),                 /*                          SI */
+   COSTS_N_INSNS (74),                 /*                          DI */
+   COSTS_N_INSNS (74)},                        /*                          
other */
+  COSTS_N_INSNS (1),                   /* cost of movsx */
+  COSTS_N_INSNS (1),                   /* cost of movzx */
+  8,                                   /* "large" insn */
+  17,                                  /* MOVE_RATIO */
+  17,                                  /* CLEAR_RATIO */
+  {6, 6, 6},                           /* cost of loading integer registers
+                                          in QImode, HImode and SImode.
+                                          Relative to reg-reg move (2).  */
+  {6, 6, 6},                           /* cost of storing integer registers */
+  {6, 6, 6, 10, 15},                   /* cost of loading SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 
512bit */
+  {6, 6, 6, 10, 15},                   /* cost of storing SSE register
+                                          in 32bit, 64bit, 128bit, 256bit and 
512bit */
+  {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
+  {6, 6, 6, 10, 15},                   /* cost of unaligned storess.  */
+  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register 
*/
+  6,                                   /* cost of moving SSE register to 
integer.  */
+  18, 6,                               /* Gather load static, per_elt.  */
+  18, 6,                               /* Gather store static, per_elt.  */
+  32,                                  /* size of l1 cache.  */
+  512,                                 /* size of l2 cache.  */
+  64,                                  /* size of prefetch block */
+  6,                                   /* number of parallel prefetches */
+  /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
+     value is increased to perhaps more appropriate value of 5.  */
+  3,                                   /* Branch cost */
+  COSTS_N_INSNS (3),                   /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),                   /* cost of FMUL instruction.  */
+  COSTS_N_INSNS (17),                  /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (14),                  /* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),                   /* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (13),                  /* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
+  tremont_memcpy,
+  tremont_memset,
+  COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
+  "16:11:8",                           /* Loop alignment.  */
+  "16:11:8",                           /* Jump alignment.  */
+  "0:0:8",                             /* Label alignment.  */
+  "16",                                        /* Func alignment.  */
+};
+
 static stringop_algs intel_memcpy[2] = {
   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 385e275bbd9..088edb6c4ca 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", 
m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
-         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+         m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
-- 
2.17.1

[PATCH 2/4] [PATCH 2/4] x86: Update memcpy/memset inline strategies for -mtune=tremont

Reply via email to