This first patch is a no-op: it only copies the cost tables. I will adjust
them one by one, to make it easier to hunt for possible regressions.
Honza
2021-03-15 Jan Hubicka <hubi...@ucw.cz>
* config/i386/i386-options.c (processor_cost_table): Add znver3_cost.
* config/i386/x86-tune-costs.h (znver3_cost): New global variable; copy
of znver2_cost.
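
For context (not part of the patch): processor_cost_table is indexed by
enum processor_type, so this change simply makes the PROCESSOR_ZNVER3 slot
point at its own table instead of reusing znver2_cost. A minimal sketch of
the lookup as I understand it (the real code lives in i386-options.c;
exact statement shown here is from memory, not from this patch):

  /* Select the active tuning cost table for the -mtune target.  */
  ix86_tune_cost = processor_cost_table[ix86_tune];

Since znver3_cost starts out as an exact copy of znver2_cost (including
the znver2_memcpy/znver2_memset stringop tables), generated code should be
unchanged by this patch.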
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index e93935f6f2c..7865bc110a3 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -743,7 +743,7 @@ static const struct processor_costs *processor_cost_table[] =
   &btver2_cost,
   &znver1_cost,
   &znver2_cost,
-  &znver2_cost
+  &znver3_cost
 };
 /* Guarantee that the array is aligned with enum processor_type. */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index cc27c7911e3..e655e668c7a 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1688,6 +1688,140 @@ struct processor_costs znver2_cost = {
"16", /* Func alignment. */
};
+struct processor_costs znver3_cost = {
+ {
+ /* Start of register allocator costs. integer->integer move cost is 2. */
+
+ /* reg-reg moves are done by renaming and thus they are even cheaper than
+ 1 cycle. Because reg-reg move cost is 2 and following tables correspond
+ to doubles of latencies, we do not model this correctly. It does not
+ seem to make practical difference to bump prices up even more. */
+ 6, /* cost for loading QImode using
+ movzbl. */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ 2, /* cost of reg,reg fld/fst. */
+ {6, 6, 16}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode. */
+ {8, 8, 16}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode. */
+ 2, /* cost of moving MMX register. */
+ {6, 6}, /* cost of loading MMX registers
+ in SImode and DImode. */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode. */
+ 2, 2, 3, /* cost of moving XMM,YMM,ZMM
+ register. */
+ {6, 6, 6, 6, 12}, /* cost of loading SSE registers
+ in 32,64,128,256 and 512-bit. */
+ {8, 8, 8, 8, 16}, /* cost of storing SSE registers
+ in 32,64,128,256 and 512-bit. */
+ 6, 6, /* SSE->integer and integer->SSE
+ moves. */
+ 8, 8, /* mask->integer and integer->mask
+ moves. */
+ {6, 6, 6}, /* cost of loading mask register
+ in QImode, HImode, SImode. */
+ {8, 8, 8}, /* cost of storing mask register
+ in QImode, HImode, SImode. */
+ 2, /* cost of moving mask register. */
+ /* End of register allocator costs. */
+ },
+
+ COSTS_N_INSNS (1), /* cost of an add instruction. */
+ COSTS_N_INSNS (1), /* cost of a lea instruction. */
+ COSTS_N_INSNS (1), /* variable shift costs. */
+ COSTS_N_INSNS (1), /* constant shift costs. */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
+ COSTS_N_INSNS (3), /* HI. */
+ COSTS_N_INSNS (3), /* SI. */
+ COSTS_N_INSNS (3), /* DI. */
+ COSTS_N_INSNS (3)}, /* other. */
+ 0, /* cost of multiply per each bit
+ set. */
+ /* Depending on parameters, idiv can get faster on Ryzen. This is an upper
+ bound. */
+ {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
+ COSTS_N_INSNS (22), /* HI. */
+ COSTS_N_INSNS (30), /* SI. */
+ COSTS_N_INSNS (45), /* DI. */
+ COSTS_N_INSNS (45)}, /* other. */
+ COSTS_N_INSNS (1), /* cost of movsx. */
+ COSTS_N_INSNS (1), /* cost of movzx. */
+ 8, /* "large" insn. */
+ 9, /* MOVE_RATIO. */
+ 6, /* CLEAR_RATIO */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ {6, 6, 6, 6, 12}, /* cost of loading SSE registers
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ {8, 8, 8, 8, 16}, /* cost of storing SSE register
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
+ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
+ 2, 2, 3, /* cost of moving XMM,YMM,ZMM
+ register. */
+ 6, /* cost of moving SSE register to integer. */
+ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
+ throughput 12. Approx 9 uops do not depend on vector size and every load
+ is 7 uops. */
+ 18, 8, /* Gather load static, per_elt. */
+ 18, 10, /* Gather store static, per_elt. */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block. */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches. */
+ 3, /* Branch cost. */
+ COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */
+ /* Latency of fdiv is 8-15. */
+ COSTS_N_INSNS (15), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ /* Latency of fsqrt is 4-10. */
+ COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
+
+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_INSNS (3), /* cost of MULSS instruction. */
+ COSTS_N_INSNS (3), /* cost of MULSD instruction. */
+ COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
+ COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
+ COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
+ /* 9-13. */
+ COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
+ COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
+ COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
+ /* Zen can execute 4 integer operations per cycle. FP operations
+ take 3 cycles and it can execute 2 integer additions and 2
+ multiplications, thus reassociation may make sense up to width of 6.
+ SPEC2k6 benchmarks suggest that 4 works better than 6, probably due to
+ register pressure.
+
+ Integer vector operations are taken by FP unit and execute 3 vector
+ plus/minus operations per cycle but only one multiply. This is adjusted
+ in ix86_reassociation_width. */
+ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ znver2_memcpy,
+ znver2_memset,
+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
+ "16", /* Loop alignment. */
+ "16", /* Jump alignment. */
+ "0:0:8", /* Label alignment. */
+ "16", /* Func alignment. */
+};
+
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},