Zen5 tuning part 5: update instruction latencies in x86-tune-costs

Jan Hubicka Thu, 05 Sep 2024 05:22:14 -0700

Hi,
there is nothing exciting in this patch.  I measured latencies and also
compared them with the newly released optimization guide, and it seems that
the only important change is that addss is faster now. It can be 2 cycles
instead of 3 in some cases when the input parameter is computed by
another addition. The throughput has increased, but we have no model for
that.


I added comments which should make it easier to update the table for
future revisions.

I also increased the large insn bound since the decoders no longer seem to
require instructions to be 8 bytes or less.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

        * config/i386/x86-tune-costs.h (znver5_cost): Update instruction
        costs.

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index b90567fbbf2..1b3227ace16 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2034,6 +2034,7 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
   COSTS_N_INSNS (1),                   /* variable shift costs.  */
   COSTS_N_INSNS (1),                   /* constant shift costs.  */
+  /* mul has latency 3, executes in 3 integer units.  */
   {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI.  */
    COSTS_N_INSNS (3),                  /*                               HI.  */
    COSTS_N_INSNS (3),                  /*                               SI.  */
@@ -2041,6 +2042,8 @@ struct processor_costs znver5_cost = {
    COSTS_N_INSNS (3)},                 /*                      other.  */
   0,                                   /* cost of multiply per each bit
                                           set.  */
+  /* integer divide has latency of 8 cycles
+     plus 1 for every 9 bits of quotient.  */
   {COSTS_N_INSNS (10),                 /* cost of a divide/mod for QI.  */
    COSTS_N_INSNS (11),                 /*                          HI.  */
    COSTS_N_INSNS (13),                 /*                          SI.  */
@@ -2048,7 +2051,7 @@ struct processor_costs znver5_cost = {
    COSTS_N_INSNS (16)},                        /*                          other.  */
   COSTS_N_INSNS (1),                   /* cost of movsx.  */
   COSTS_N_INSNS (1),                   /* cost of movzx.  */
-  8,                                   /* "large" insn.  */
+  15,                                  /* "large" insn.  */
   9,                                   /* MOVE_RATIO.  */
   6,                                   /* CLEAR_RATIO */
   {6, 6, 6},                           /* cost of loading integer registers
@@ -2065,12 +2068,13 @@ struct processor_costs znver5_cost = {
   2, 2, 2,                             /* cost of moving XMM,YMM,ZMM
                                           register.  */
   6,                                   /* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
-     throughput 5.  Approx 7 uops do not depend on vector size and every load
-     is 5 uops.  */
+
+  /* TODO: gather and scatter instructions are currently disabled in
+     x86-tune.def.  In some cases they are however a win, see PR116582.
+     We however need a good cost model for them.  */
   14, 10,                              /* Gather load static, per_elt.  */
   14, 20,                              /* Gather store static, per_elt.  */
-  32,                                  /* size of l1 cache.  */
+  48,                                  /* size of l1 cache.  */
   1024,                                        /* size of l2 cache.  */
   64,                                  /* size of prefetch block.  */
   /* New AMD processors never drop prefetches; if they cannot be performed
@@ -2080,6 +2084,8 @@ struct processor_costs znver5_cost = {
      time).  */
   100,                                 /* number of parallel prefetches.  */
   3,                                   /* Branch cost.  */
+  /* TODO x87 latencies are still based on znver4.
+     Probably not very important these days.  */
   COSTS_N_INSNS (7),                   /* cost of FADD and FSUB insns.  */
   COSTS_N_INSNS (7),                   /* cost of FMUL instruction.  */
   /* Latency of fdiv is 8-15.  */
@@ -2089,16 +2095,24 @@ struct processor_costs znver5_cost = {
   /* Latency of fsqrt is 4-10.  */
   COSTS_N_INSNS (25),                  /* cost of FSQRT instruction.  */
 
+  /* SSE instructions have typical throughput 4 and latency 1.  */
   COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* ADDSS has throughput 2 and latency 2
+     (in some cases when source is another addition).  */
+  COSTS_N_INSNS (2),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  /* MULSS has throughput 2 and latency 3.  */
   COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
   COSTS_N_INSNS (3),                   /* cost of MULSD instruction.  */
+  /* FMA has throughput 2 and latency 4.  */
   COSTS_N_INSNS (4),                   /* cost of FMA SS instruction.  */
   COSTS_N_INSNS (4),                   /* cost of FMA SD instruction.  */
+  /* DIVSS has throughput 0.4 and latency 10.  */
   COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
-  /* 9-13.  */
+  /* DIVSD has throughput 0.25 and latency 13.  */
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
+  /* SQRTSS has throughput 0.22 and latency 14.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
+  /* SQRTSD has throughput 0.13 and latency 20.  */
   COSTS_N_INSNS (20),                  /* cost of SQRTSD instruction.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.

Zen5 tuning part 5: update instruction latencies in x86-tune-costs

Reply via email to