[gcc r13-9309] Zen5 tuning part 5: update instruction latencies in x86-tune-costs

Jan Hubicka via Gcc-cvs Sun, 12 Jan 2025 14:00:07 -0800

https://gcc.gnu.org/g:f10d381dfc983ea32e5f72faadc7eb8126f114f6


commit r13-9309-gf10d381dfc983ea32e5f72faadc7eb8126f114f6
Author: Jan Hubicka <j...@suse.cz>
Date:   Wed Sep 4 09:19:08 2024 +0200

    Zen5 tuning part 5: update instruction latencies in x86-tune-costs
    
    There is nothing exciting in this patch.  I measured latencies and also
    compared them with the newly released optimization guide.  There are no
    dramatic changes compared to zen4.  One interesting new bit is that addss
    is faster and can be 2 cycles when fed by another addss.
    
    I also increased the large insn bound since the decoders no longer seem
    to require instructions to be 8 bytes or less.
    
    gcc/ChangeLog:
    
            * config/i386/x86-tune-costs.h (znver5_cost): Update instruction
            costs.
    
    (cherry picked from commit 4292297a0f938ffc953422fa246ff00fe345fe3d)

Diff:
---
 gcc/config/i386/x86-tune-costs.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b89ac640ea5f..9edc6e36557d 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2034,6 +2034,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
   COSTS_N_INSNS (1),                   /* variable shift costs.  */
   COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
    COSTS_N_INSNS (3),                  /*                               HI.  */
    COSTS_N_INSNS (3),                  /*                               SI.  */
@@ -2041,6 +2042,8 @@ struct processor_costs znver5_cost = {
    COSTS_N_INSNS (3)},                 /*                      other.  */
   0,                                   /* cost of multiply per each bit
                                           set.  */
+  /* integer divide has latency of 8 cycles
+     plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10),                 /* cost of a divide/mod for QI.  */
    COSTS_N_INSNS (11),                 /*                          HI.  */
    COSTS_N_INSNS (13),                 /*                          SI.  */
@@ -2048,7 +2051,7 @@ struct processor_costs znver5_cost = {
    COSTS_N_INSNS (16)},                        /*                          other.  */
   COSTS_N_INSNS (1),                   /* cost of movsx.  */
   COSTS_N_INSNS (1),                   /* cost of movzx.  */
-  8,                                   /* "large" insn.  */
+  15,                                  /* "large" insn.  */
   9,                                   /* MOVE_RATIO.  */
   6,                                   /* CLEAR_RATIO */
   {6, 6, 6},                           /* cost of loading integer registers
@@ -2065,12 +2068,13 @@ struct processor_costs znver5_cost = {
   2, 2, 2,                             /* cost of moving XMM,YMM,ZMM
                                           register.  */
   6,                                   /* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
-     throughput 5.  Approx 7 uops do not depend on vector size and every load
-     is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+     x86-tune.def.  In some cases they are however a win, see PR116582
+     We however need good cost model for them.  */
   14, 10,                              /* Gather load static, per_elt.  */
   14, 20,                              /* Gather store static, per_elt.  */
-  32,                                  /* size of l1 cache.  */
+  48,                                  /* size of l1 cache.  */
   1024,                                        /* size of l2 cache.  */
   64,                                  /* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2080,6 +2084,8 @@ struct processor_costs znver5_cost = {
      time).  */
   100,                                 /* number of parallel prefetches.  */
   3,                                   /* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+     Probably not very important these days.  */
   COSTS_N_INSNS (7),                   /* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),                   /* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2089,16 +2095,24 @@ struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),                  /* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* ADDSS has throughput 2 and latency 2
+     (in some cases when source is another addition).  */
+  COSTS_N_INSNS (2),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* MULSS has throughput 2 and latency 3.  */
   COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),                   /* cost of MULSD instruction.  */
+  /* FMA had throughput 2 and latency 4.  */
   COSTS_N_INSNS (4),                   /* cost of FMA SS instruction.  */
   COSTS_N_INSNS (4),                   /* cost of FMA SD instruction.  */
+  /* DIVSS has throughput 0.4 and latency 10.  */
   COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
-  /* 9-13.  */
+  /* DIVSD has throughput 0.25 and latency 13.  */
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
+  /* SQRTSS has throughput 0.22 and latency 14.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
+  /* SQRTSD has throughput 0.13 and latency 20.  */
   COSTS_N_INSNS (20),                  /* cost of SQRTSD instruction.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.

[gcc r13-9309] Zen5 tuning part 5: update instruction latencies in x86-tune-costs

Reply via email to