This patch adds the cost model for Neoverse N1, based on the information from the "Arm Neoverse N1 Software Optimization Guide".
-- Evandro Menezes ================================================================================ gcc/ChangeLog: * config/aarch64/aarch64-cores.def: Use the Neoverse N1 cost model. * config/aarch64/aarch64.cc (cortexa76_tunings): Rename variable. (neoversen1_addrcost_table): New variable. (neoversen1_regmove_cost): Likewise. (neoversen1_advsimd_vector_cost): Likewise. (neoversen1_scalar_issue_info): Likewise. (neoversen1_advsimd_issue_info): Likewise. (neoversen1_vec_issue_info): Likewise. (neoversen1_vector_cost): Likewise. (neoversen1_tunings): Likewise. * config/arm/aarch-cost-tables.h (neoversen1_extra_costs): New variable. Signed-off-by: Evandro Menezes <evan...@gcc.gnu.org> --- gcc/config/aarch64/aarch64-cores.def | 20 ++-- gcc/config/aarch64/aarch64.cc | 155 ++++++++++++++++++++++++--- gcc/config/arm/aarch-cost-tables.h | 107 ++++++++++++++++++ 3 files changed, 259 insertions(+), 23 deletions(-) diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 2ec88c98400..e352e4077b1 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -105,17 +105,17 @@ AARCH64_CORE("thunderx2t99", thunderx2t99, thunderx2t99, V8_1A, (CRYPTO), thu /* ARM ('A') cores. 
*/ AARCH64_CORE("cortex-a55", cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa53, 0x41, 0xd05, -1) AARCH64_CORE("cortex-a75", cortexa75, cortexa57, V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, 0xd0a, -1) -AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1) -AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1) -AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1) -AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1) -AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1) -AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1) +AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, 0xd0b, -1) +AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1) +AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1) +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1) +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1) +AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1) AARCH64_CORE("cortex-a65", cortexa65, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1) -AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 
0x41, 0xd44, -1) -AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1) -AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1) +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1) +AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1) +AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), cortexa76, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1) @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, V8A, (CRC /* ARM DynamIQ big.LITTLE configurations. */ AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1) -AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1) +AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1) /* Armv8-R Architecture Processors. 
*/ AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, 0xd15, -1) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 42617ced73a..46710490a39 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1867,7 +1867,7 @@ static const struct tune_params thunderx3t110_tunings = &thunderx3t110_prefetch_tune }; -static const struct tune_params neoversen1_tunings = +static const struct tune_params cortexa76_tunings = { &cortexa76_extra_costs, &generic_addrcost_table, @@ -1885,18 +1885,18 @@ static const struct tune_params neoversen1_tunings = }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ - "32:16", /* function_align. */ - "4", /* jump_align. */ - "32:16", /* loop_align. */ - 2, /* int_reassoc_width. */ - 4, /* fp_reassoc_width. */ - 1, /* fma_reassoc_width. */ - 2, /* vec_reassoc_width. */ - 2, /* min_div_recip_mul_sf. */ - 2, /* min_div_recip_mul_df. */ - 0, /* max_case_values. */ - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. 
*/ &generic_prefetch_tune }; @@ -2293,6 +2293,135 @@ static const struct tune_params neoverse512tvb_tunings = &generic_prefetch_tune }; +static const struct cpu_addrcost_table neoversen1_addrcost_table = +{ + { + 0, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 1, /* post_modify_ld3_st3 */ + 1, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + +static const struct cpu_regmove_cost neoversen1_regmove_cost = +{ + 1, /* GP2GP */ + /* Avoid the use of slow int<->fp moves for spilling by setting + their cost higher than memmov_cost. */ + 3, /* GP2FP */ + 2, /* FP2GP */ + 2 /* FP2FP */ +}; + +static const advsimd_vec_cost neoversen1_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 0, /* ld2_st2_permute_cost */ + 0, /* ld3_st3_permute_cost */ + 0, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + 6, /* reduc_i8_cost */ + 5, /* reduc_i16_cost */ + 3, /* reduc_i32_cost */ + 3, /* reduc_i64_cost */ + 8, /* reduc_f16_cost */ + 5, /* reduc_f32_cost */ + 5, /* reduc_f64_cost */ + 0, /* store_elt_extra_cost */ + 2, /* vec_to_scalar_cost */ + 2, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info = +{ + 2, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversen1_advsimd_issue_info = +{ + { + 2, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 3, /* ld2_st2_general_ops */ + 5, /* ld3_st3_general_ops */ + 11 /* ld4_st4_general_ops */ +}; + +static const 
aarch64_vec_issue_info neoversen1_vec_issue_info = +{ + &neoversen1_scalar_issue_info, /* scalar */ + &neoversen1_advsimd_issue_info, /* advsimd */ + nullptr /* sve */ +}; + + +static const struct cpu_vector_cost neoversen1_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 1, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &neoversen1_advsimd_vector_cost, /* advsimd */ + nullptr, /* sve */ + &neoversen1_vec_issue_info /* issue_info */ +}; + +static const struct tune_params neoversen1_tunings = +{ + &neoversen1_extra_costs, + &neoversen1_addrcost_table, + &neoversen1_regmove_cost, + &neoversen1_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ + { 4, /* load_int. */ + 2, /* store_int. */ + 5, /* load_fp. */ + 2, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ + 4, /* issue_rate */ + AARCH64_FUSE_AES_AESMC, /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 1, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags. */ + &generic_prefetch_tune +}; + static const advsimd_vec_cost neoversen2_advsimd_vector_cost = { 2, /* int_stmt_cost */ diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h index e3848214728..fce6da6bbcc 100644 --- a/gcc/config/arm/aarch-cost-tables.h +++ b/gcc/config/arm/aarch-cost-tables.h @@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs = } }; +const struct cpu_cost_table neoversen1_extra_costs = +{ + /* ALU */ + { + 0, /* arith. */ + 0, /* logical. */ + 0, /* shift. 
*/ + 0, /* shift_reg. */ + COSTS_N_INSNS (1), /* arith_shift. */ + COSTS_N_INSNS (1), /* arith_shift_reg. */ + 0, /* log_shift. */ + COSTS_N_INSNS (1), /* log_shift_reg. */ + 0, /* extend. */ + COSTS_N_INSNS (1), /* extend_arith. */ + COSTS_N_INSNS (1), /* bfi. */ + 0, /* bfx. */ + 0, /* clz. */ + 0, /* rev. */ + 0, /* non_exec. */ + true /* non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (1), /* simple. */ + COSTS_N_INSNS (2), /* flag_setting. */ + COSTS_N_INSNS (1), /* extend. */ + COSTS_N_INSNS (1), /* add. */ + COSTS_N_INSNS (1), /* extend_add. */ + COSTS_N_INSNS (11) /* idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (3), /* simple. */ + 0, /* flag_setting (N/A). */ + COSTS_N_INSNS (1), /* extend. */ + COSTS_N_INSNS (3), /* add. */ + COSTS_N_INSNS (1), /* extend_add. */ + COSTS_N_INSNS (19) /* idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (3), /* load. */ + COSTS_N_INSNS (3), /* load_sign_extend. */ + COSTS_N_INSNS (3), /* ldrd. */ + COSTS_N_INSNS (2), /* ldm_1st. */ + 1, /* ldm_regs_per_insn_1st. */ + 2, /* ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (4), /* loadf. */ + COSTS_N_INSNS (4), /* loadd. */ + COSTS_N_INSNS (3), /* load_unaligned. */ + 0, /* store. */ + 0, /* strd. */ + 0, /* stm_1st. */ + 1, /* stm_regs_per_insn_1st. */ + 2, /* stm_regs_per_insn_subsequent. */ + 0, /* storef. */ + 0, /* stored. */ + COSTS_N_INSNS (1), /* store_unaligned. */ + COSTS_N_INSNS (1), /* loadv. */ + COSTS_N_INSNS (1) /* storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (9), /* div. */ + COSTS_N_INSNS (2), /* mult. */ + COSTS_N_INSNS (3), /* mult_addsub. */ + COSTS_N_INSNS (3), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + 0, /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (14), /* div. 
*/ + COSTS_N_INSNS (2), /* mult. */ + COSTS_N_INSNS (3), /* mult_addsub. */ + COSTS_N_INSNS (3), /* fma. */ + COSTS_N_INSNS (1), /* addsub. */ + COSTS_N_INSNS (1), /* fpconst. */ + 0, /* neg. */ + 0, /* compare. */ + COSTS_N_INSNS (1), /* widen. */ + COSTS_N_INSNS (1), /* narrow. */ + COSTS_N_INSNS (1), /* toint. */ + COSTS_N_INSNS (1), /* fromint. */ + COSTS_N_INSNS (1) /* roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (1), /* alu. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (1), /* dup. */ + COSTS_N_INSNS (1) /* extract. */ + } +}; + const struct cpu_cost_table exynosm1_extra_costs = { /* ALU */ -- 2.39.2 (Apple Git-143) -- Evandro Menezes ◊ evan...@yahoo.com ◊ Austin, TX Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus