Hi, Znver5 has an addss cost of 2 cycles, while other common floating-point SSE operations cost 3 cycles. We currently have only one entry in the cost tables, which makes it impossible to model this. This patch adds sse_fp_op, which is used for other common FP operations (basically conversions), and updates the code computing costs.
The logic is that a typical integer SSE operation (say, addition) takes 1 cycle, and that corresponds to sse_op. A "typical" SSE FP operation, i.e. one we do not have a separate cost entry for (e.g. cvtss2sd), takes 3 cycles. Looking across the costing code, there are a few things that I think make sense to work on incrementally. - add_stmt_cost accounts max/min as sse_op (1) while it is 2 for FP. This will need an extra entry. - add_stmt_cost does not seem to special-case sqrt, FP->FP conversions and int<->fp conversions, which are all a bit different. - There is also a problem in the way constructors are modeled, since an integer->sse move is accounted as addss (now sse_fp_op) while it probably should be derived from the integer_to_sse cost (on Zen, it is more expensive than a usual FP operation and we already have a cost entry for it). Again, I guess the problem here is that we do not really know what we are constructing, and if the specific field of the vector is, say, a constant or something doable in an SSE register, we won't need to pay the cost of an inter-unit move. Bootstrapped/regtested x86_64-linux. Richi, I wonder if this makes sense to you? I know you plan to change the vectorizer cost code this stage1... gcc/ChangeLog: PR target/119298 * config/i386/i386.cc (ix86_rtx_costs): Use sse_fp_op. (ix86_builtin_vectorization_cost): Use sse_fp_op. * config/i386/i386.h (struct processor_costs): Add sse_fp_op. 
* config/i386/x86-tune-costs.h (struct processor_costs): Update constructors diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index b172f716c68..3e8106bdd31 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22482,14 +22482,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) *total = 0; else - *total = ix86_vec_cost (mode, cost->addss); + *total = ix86_vec_cost (mode, cost->sse_fp_op); return false; case FLOAT_TRUNCATE: if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) *total = cost->fadd; else - *total = ix86_vec_cost (mode, cost->addss); + *total = ix86_vec_cost (mode, cost->sse_fp_op); return false; case ABS: @@ -24675,7 +24675,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, switch (type_of_cost) { case scalar_stmt: - return fp ? ix86_cost->addss : COSTS_N_INSNS (1); + return fp ? ix86_cost->sse_fp_op : COSTS_N_INSNS (1); case scalar_load: /* load/store costs are relative to register move which is 2. Recompute @@ -24689,7 +24689,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vector_stmt: return ix86_vec_cost (mode, - fp ? ix86_cost->addss : ix86_cost->sse_op); + fp ? ix86_cost->sse_fp_op : ix86_cost->sse_op); case vector_load: index = sse_store_index (mode); @@ -24759,12 +24759,12 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* One vinserti128 for combining two SSE vectors for AVX256. */ else if (GET_MODE_BITSIZE (mode) == 256) return ((n - 2) * ix86_cost->sse_op - + ix86_vec_cost (mode, ix86_cost->addss)); + + ix86_vec_cost (mode, ix86_cost->sse_fp_op)); /* One vinserti64x4 and two vinserti128 for combining SSE and AVX256 vectors to AVX512. 
*/ else if (GET_MODE_BITSIZE (mode) == 512) return ((n - 4) * ix86_cost->sse_op - + 3 * ix86_vec_cost (mode, ix86_cost->addss)); + + 3 * ix86_vec_cost (mode, ix86_cost->sse_fp_op)); gcc_unreachable (); } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8507243d726..bb3620731ec 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -198,6 +198,8 @@ struct processor_costs { /* Specify what algorithm to use for stringops on unknown size. */ const int sse_op; /* cost of cheap SSE instruction. */ + const int sse_fp_op; /* cost of typical SSE FP instruction not + listed below (such as conversion). */ const int addss; /* cost of ADDSS/SD SUBSS/SD instructions. */ const int mulss; /* cost of MULSS instructions. */ const int mulsd; /* cost of MULSD instructions. */ diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 9477345bdd7..d7f0f19ec55 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -122,6 +122,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ + COSTS_N_BYTES (2), /* cost of SSE FP instruction. */ COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_BYTES (2), /* cost of MULSS instruction. */ COSTS_N_BYTES (2), /* cost of MULSD instruction. */ @@ -234,6 +235,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (23), /* cost of SSE FP instruction. */ COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (27), /* cost of MULSS instruction. */ COSTS_N_INSNS (27), /* cost of MULSD instruction. */ @@ -347,6 +349,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (83), /* cost of FSQRT instruction. 
*/ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (8), /* cost of SSE FP instruction. */ COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (16), /* cost of MULSS instruction. */ COSTS_N_INSNS (16), /* cost of MULSD instruction. */ @@ -458,6 +461,7 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -562,6 +566,7 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (5), /* cost of SSE FP instruction. */ COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (5), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */ @@ -681,6 +686,7 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -791,6 +797,7 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (6), /* cost of SSE FP instruction. */ COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (11), /* cost of MULSS instruction. */ COSTS_N_INSNS (11), /* cost of MULSD instruction. */ @@ -904,6 +911,7 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. 
*/ + COSTS_N_INSNS (2), /* cost of SSE FP instruction. */ COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (2), /* cost of MULSS instruction. */ COSTS_N_INSNS (2), /* cost of MULSD instruction. */ @@ -1017,6 +1025,7 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -1140,6 +1149,7 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -1271,6 +1281,7 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -1394,6 +1405,7 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (6), /* cost of SSE FP instruction. */ COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (6), /* cost of MULSS instruction. */ COSTS_N_INSNS (6), /* cost of MULSD instruction. */ @@ -1543,6 +1555,7 @@ struct processor_costs znver1_cost = { COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. 
*/ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -1702,6 +1715,7 @@ struct processor_costs znver2_cost = { COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -1837,6 +1851,7 @@ struct processor_costs znver3_cost = { COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -1974,6 +1989,7 @@ struct processor_costs znver4_cost = { COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -2039,7 +2055,7 @@ struct processor_costs znver5_cost = { in 32,64,128,256 and 512-bit. */ {8, 8, 8, 12, 12}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit. */ - 6, 8, /* SSE->integer and integer->SSE + 7, 9, /* SSE->integer and integer->SSE moves. */ 8, 8, /* mask->integer and integer->mask moves */ {6, 6, 6}, /* cost of loading mask register @@ -2118,9 +2134,10 @@ struct processor_costs znver5_cost = { /* SSE instructions have typical throughput 4 and latency 1. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. 
*/ /* ADDSS has throughput 2 and latency 2 (in some cases when source is another addition). */ - COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ /* MULSS has throughput 2 and latency 3. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -2265,6 +2282,7 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -2394,6 +2412,7 @@ struct processor_costs icelake_cost = { COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -2517,6 +2536,7 @@ struct processor_costs alderlake_cost = { COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */ @@ -2633,6 +2653,7 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (2), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. 
*/ @@ -2746,6 +2767,7 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (2), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -2858,6 +2880,7 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (4), /* cost of SSE FP instruction. */ COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (6), /* cost of MULSS instruction. */ COSTS_N_INSNS (6), /* cost of MULSD instruction. */ @@ -2973,6 +2996,7 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (5), /* cost of SSE FP instruction. */ COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (7), /* cost of MULSS instruction. */ COSTS_N_INSNS (7), /* cost of MULSD instruction. */ @@ -3086,6 +3110,7 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (5), /* cost of SSE FP instruction. */ COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */ @@ -3199,6 +3224,7 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. 
*/ @@ -3326,6 +3352,7 @@ struct processor_costs tremont_cost = { COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */ @@ -3439,6 +3466,7 @@ struct processor_costs intel_cost = { COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (8), /* cost of SSE FP instruction. */ COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (8), /* cost of MULSS instruction. */ COSTS_N_INSNS (8), /* cost of MULSD instruction. */ @@ -3557,6 +3585,7 @@ struct processor_costs lujiazui_cost = { COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (4), /* cost of MULSD instruction. */ @@ -3673,6 +3702,7 @@ struct processor_costs yongfeng_cost = { COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ @@ -3789,6 +3819,7 @@ struct processor_costs shijidadao_cost = { COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. 
*/ @@ -3913,6 +3944,7 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */ @@ -4042,6 +4074,7 @@ struct processor_costs core_cost = { COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of SSE FP instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ COSTS_N_INSNS (4), /* cost of MULSS instruction. */ COSTS_N_INSNS (5), /* cost of MULSD instruction. */