On Tue, Jul 30, 2019 at 10:09 AM Jan Hubicka <hubi...@ucw.cz> wrote: > > > Hi, > > this patch updates znver2 costs to match reality. In particular we > > re-benchmarked memcpy strategies and it looks like glibc now wins even > > for relatively small blocks. > > Moreover I updated costs of moves to reflect that znver2 has 256-bit vector > > paths and faster multiplication. > > > > Bootstrapped/regtested x86_64-linux, committed. > > > > Honza > > > > * x86-tune-costs.h (znver2_memcpy): Update. > > (znver2_cost): Update 256-bit SSE costs and multiplication. > > Hello, > I have now backported the patch to the gcc 9 branch.
Thanks - can you please update changes.html for it in the 9.2 section? Richard. > Honza > > Index: config/i386/x86-tune-costs.h > > =================================================================== > > --- config/i386/x86-tune-costs.h (revision 273727) > > +++ config/i386/x86-tune-costs.h (working copy) > > @@ -1279,12 +1279,12 @@ struct processor_costs znver1_cost = { > > static stringop_algs znver2_memcpy[2] = { > > {libcall, {{6, loop, false}, {14, unrolled_loop, false}, > > {-1, rep_prefix_4_byte, false}}}, > > - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, > > + {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false}, > > {-1, libcall, false}}}}; > > static stringop_algs znver2_memset[2] = { > > {libcall, {{8, loop, false}, {24, unrolled_loop, false}, > > {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, > > - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, > > + {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, > > false}, > > {-1, libcall, false}}}}; > > > > struct processor_costs znver2_cost = { > > @@ -1335,11 +1335,11 @@ struct processor_costs znver2_cost = { > > in SImode and DImode. */ > > {8, 8}, /* cost of storing MMX registers > > in SImode and DImode. */ > > - 2, 3, 6, /* cost of moving XMM,YMM,ZMM > > + 2, 2, 3, /* cost of moving XMM,YMM,ZMM > > register. */ > > - {6, 6, 6, 10, 20}, /* cost of loading SSE registers > > + {6, 6, 6, 6, 12}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit. */ > > - {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ > > + {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ > > {8, 8, 8, 8, 16}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit. */ > > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > @@ -1372,7 +1372,7 @@ struct processor_costs znver2_cost = { > > COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ > > COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. 
> > */ > > COSTS_N_INSNS (3), /* cost of MULSS instruction. */ > > - COSTS_N_INSNS (4), /* cost of MULSD instruction. */ > > + COSTS_N_INSNS (3), /* cost of MULSD instruction. */ > > COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ > > COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ > > COSTS_N_INSNS (10), /* cost of DIVSS instruction. > > */