On Thu, 11 Oct 2018, Richard Biener wrote: > > The following fixes vec_construct cost calculation to properly consider > that the inserts will happen to SSE regs thus forgo the multiplication > done in ix86_vec_cost which is passed the wrong mode. This gets rid of > the only call passing false to ix86_vec_cost (so consider the patch > amended to remove the arg if approved). > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > OK for trunk? > > I am considering to make the factor we apply in ix86_vec_cost > which currently depends on X86_TUNE_AVX128_OPTIMAL and > X86_TUNE_SSE_SPLIT_REGS part of the actual cost tables since > the reason we apply them are underlying CPU architecture details. > Was the original reason of doing the multiplication based on > those tunings to be able to "share" the same basic cost table > across architectures that differ in this important detail? > I see X86_TUNE_SSE_SPLIT_REGS is only used for m_ATHLON_K8 > and X86_TUNE_AVX128_OPTIMAL is used for m_BDVER, m_BTVER2 > and m_ZNVER1. Those all have (multiple) exclusive processor_cost_table > entries. > > As a first step I'd like to remove the use of ix86_vec_cost for > the entries that already have entries for multiple modes > (loads and stores) and apply the factor there. For example > Zen can do two 128bit loads per cycle but only one 128bit store. > With multiplying AVX256 costs by two we seem to cost sth like > # instructions to dispatch * instruction latency which is an > odd thing. I'd have expected # instructions to dispatch / instruction > throughput * instruction latency - so a AVX256 add would cost > the same as a AVX128 add, likewise for loads but stores would be > more expensive because of the throughput issue. This all > ignores resource utilization across multiple insns but that's > how the cost model works ...
So like the following which removes the use of ix86_vec_cost for SSE loads and stores since we have per-mode costs already. I've applied the relevant factor to the individual cost tables (noting that for X86_TUNE_SSE_SPLIT_REGS we only apply the multiplication for size == 128, not size >= 128 ...) There's a ??? hunk in inline_memory_move_cost where we failed to apply the scaling thus in that place we'd now have a behavior change. Alternatively I could leave the cost tables unaltered if that costing part is more critical than the vectorizer one. I've also spotted, when reviewing ix86_vec_cost uses, a bug in ix86_rtx_costs which keys on SFmode which doesn't work for SSE modes, thus use GET_MODE_INNER. Also I've changed X86_TUNE_AVX128_OPTIMAL to also apply to BTVER1 - everywhere else we glob BTVER1 and BTVER2 so this must surely be an omission. Honza - is a patch like this OK? Should I split out individual fixes to make bisection possible? Should I update the cost tables or instead change the vectorizer costing when considering the inline_memory_move_cost "issue"? Thanks, Richard. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 0cf4152acb2..f5392232f61 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -39432,6 +39432,7 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int index = sse_store_index (mode); if (index == -1) return 100; + /* ??? */ if (in == 2) return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; @@ -40183,7 +40181,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); *total = ix86_vec_cost (mode, - mode == SFmode ? cost->fmass : cost->fmasd, + GET_MODE_INNER (mode) == SFmode + ?
cost->fmass : cost->fmasd, true); *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); @@ -45122,18 +45121,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* See PR82713 - we may end up being called on non-vector type. */ if (index < 0) index = 2; - return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2, - true); + return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; case vector_store: index = sse_store_index (mode); /* See PR82713 - we may end up being called on non-vector type. */ if (index < 0) index = 2; - return ix86_vec_cost (mode, - COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2, - true); + return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; case vec_to_scalar: case scalar_to_vec: @@ -45146,20 +45141,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* See PR82713 - we may end up being called on non-vector type. */ if (index < 0) index = 2; - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->sse_unaligned_load[index]) / 2, - true); + return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; case unaligned_store: index = sse_store_index (mode); /* See PR82713 - we may end up being called on non-vector type. 
*/ if (index < 0) index = 2; - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->sse_unaligned_store[index]) / 2, - true); + return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; case vector_gather_load: return ix86_vec_cost (mode, diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 9c8ae0a7841..59d0a8b17d0 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -795,12 +795,12 @@ struct processor_costs athlon_cost = { {4, 4}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 4, 6, 12, 24}, /* cost of loading SSE registers + {4, 4, 12, 12, 24}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ - {4, 4, 5, 10, 20}, /* cost of storing SSE registers + {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ + {4, 4, 10, 10, 20}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ + {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 5, 5, /* SSE->integer and integer->SSE moves */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ @@ -891,12 +891,12 @@ struct processor_costs k8_cost = { {4, 4}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {4, 3, 6, 12, 24}, /* cost of loading SSE registers + {4, 3, 12, 12, 24}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ - {4, 4, 5, 10, 20}, /* cost of storing SSE registers + {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ + {4, 4, 10, 10, 20}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ + {4, 4, 10, 10, 20}, /* cost of unaligned stores. 
*/ 5, 5, /* SSE->integer and integer->SSE moves */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ @@ -1100,12 +1100,12 @@ const struct processor_costs bdver1_cost = { {10, 10}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 10, 20, 30}, /* cost of loading SSE registers + {12, 12, 10, 40, 60}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ - {10, 10, 10, 20, 30}, /* cost of storing SSE registers + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ + {10, 10, 10, 40, 60}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 16, 20, /* SSE->integer and integer->SSE moves */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ @@ -1203,12 +1203,12 @@ const struct processor_costs bdver2_cost = { {10, 10}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 10, 20, 30}, /* cost of loading SSE registers + {12, 12, 10, 40, 60}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ - {10, 10, 10, 20, 30}, /* cost of storing SSE registers + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ + {10, 10, 10, 40, 60}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 16, 20, /* SSE->integer and integer->SSE moves */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. 
*/ @@ -1305,12 +1305,12 @@ struct processor_costs bdver3_cost = { {10, 10}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 10, 20, 30}, /* cost of loading SSE registers + {12, 12, 10, 40, 60}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ - {10, 10, 10, 20, 30}, /* cost of storing SSE registers + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ + {10, 10, 10, 40, 60}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 16, 20, /* SSE->integer and integer->SSE moves */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ @@ -1406,12 +1406,12 @@ struct processor_costs bdver4_cost = { {10, 10}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {12, 12, 10, 20, 30}, /* cost of loading SSE registers + {12, 12, 10, 40, 60}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ - {10, 10, 10, 20, 30}, /* cost of storing SSE registers + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ + {10, 10, 10, 40, 60}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 16, 20, /* SSE->integer and integer->SSE moves */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ @@ -1518,12 +1518,12 @@ struct processor_costs znver1_cost = { {8, 8}, /* cost of storing MMX registers in SImode and DImode. */ 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ - {6, 6, 6, 6, 12}, /* cost of loading SSE registers + {6, 6, 6, 12, 24}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit. 
*/ - {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ - {8, 8, 8, 8, 16}, /* cost of storing SSE registers + {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ + {8, 8, 8, 16, 32}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit. */ - {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ + {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 6, 6, /* SSE->integer and integer->SSE moves. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load @@ -1726,12 +1726,12 @@ const struct processor_costs btver1_cost = { {12, 12}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {10, 10, 12, 24, 48}, /* cost of loading SSE registers + {10, 10, 12, 48, 96}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ - {10, 10, 12, 24, 48}, /* cost of storing SSE registers + {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ + {10, 10, 12, 48, 96}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 14, 14, /* SSE->integer and integer->SSE moves */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ @@ -1817,12 +1817,12 @@ const struct processor_costs btver2_cost = { {12, 12}, /* cost of storing MMX registers in SImode and DImode */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ - {10, 10, 12, 24, 48}, /* cost of loading SSE registers + {10, 10, 12, 48, 96}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ - {10, 10, 12, 24, 48}, /* cost of storing SSE registers + {10, 10, 12, 48, 96}, /* cost of unaligned loads. 
*/ + {10, 10, 12, 48, 96}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 14, 14, /* SSE->integer and integer->SSE moves */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index a46450ad99d..ad8661075aa 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -441,7 +441,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal" /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for the auto-vectorizer. */ -DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 +DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER | m_ZNVER1) /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX