Hi, the following patch makes the FP sqrt and div costs better match modern chips. It enables vectorization of inner loops in SPEC2006 cactusADM and leslie3d, with over 10% speedup on Zen and also on Haswell.
Bootstrapped/regtested x86_64-linux, committed. Honza PR target/81616 * x86-tune-costs.h (generic_cost): Reduce cost of FDIV 20->17, cost of FSQRT 20->14, DIVSS 18->13, DIVSD 32->17, SQRTSS 30->14 and SQRTSD 58->18. Increase cond_taken_branch_cost 3->4 and cond_not_taken_branch_cost 1->2. Index: config/i386/x86-tune-costs.h =================================================================== --- config/i386/x86-tune-costs.h (revision 256065) +++ config/i386/x86-tune-costs.h (working copy) @@ -2293,10 +2293,10 @@ struct processor_costs generic_cost = { 3, /* Branch cost */ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (5), /* cost of FMUL instruction. */ - COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (17), /* cost of FDIV instruction. */ COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ - COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ + COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ @@ -2304,15 +2304,15 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (5), /* cost of MULSD instruction. */ COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ - COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ - COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ - COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ - COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ generic_memcpy, generic_memset, - COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ - COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. 
*/ + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ }; /* core_cost should produce code tuned for Core familly of CPUs. */