Hi Evandro, I wanted to give this patch a try, but the diff seems corrupt: the whitespace at the start of the context lines seems to have gone missing.
Could you try resending it? Thanks, Tamar > -----Original Message----- > From: Gcc-patches <gcc-patches- > bounces+tamar.christina=arm....@gcc.gnu.org> On Behalf Of Evandro > Menezes via Gcc-patches > Sent: Tuesday, April 18, 2023 10:42 PM > To: gcc-patches@gcc.gnu.org > Cc: Evandro Menezes <ebah...@icloud.com>; Richard Sandiford > <richard.sandif...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com> > Subject: [PATCH] aarch64: Add the cost model for Neoverse N1 > > This patch adds the cost model for Neoverse N1, based on the information > from the "Arm Neoverse N1 Software Optimization Guide”. > > -- > Evandro Menezes > > =================================================================== > ============= > > gcc/ChangeLog: > > * config/aarch64/aarch64-cores.def: Use the Neoverse N1 cost model. > * config/aarch64/aarch64.cc > (cortexa76_tunings): Rename variable. > (neoversen1_addrcost_table): New variable. > (neoversen1_vector_cost): Likewise. > (neoversen1_regmove_cost): Likewise. > (neoversen1_advsimd_vector_cost): Likewise. > (neoversen1_scalar_issue_info): Likewise. > (neoversen1_advsimd_issue_info): Likewise. > (neoversen1_vec_issue_info): Likewise. > (neoversen1_vector_cost): Likewise. > (neoversen1_tunings): Likewise. > * config/arm/aarch-cost-tables.h > (neoversen1_extra_costs): New variable. > > Signed-off-by: Evandro Menezes <evan...@gcc.gnu.org> > --- > gcc/config/aarch64/aarch64-cores.def | 20 ++-- > gcc/config/aarch64/aarch64.cc | 155 ++++++++++++++++++++++++--- > gcc/config/arm/aarch-cost-tables.h | 107 ++++++++++++++++++ > 3 files changed, 259 insertions(+), 23 deletions(-) > > diff --git a/gcc/config/aarch64/aarch64-cores.def > b/gcc/config/aarch64/aarch64-cores.def > index 2ec88c98400..e352e4077b1 100644 > --- a/gcc/config/aarch64/aarch64-cores.def > +++ b/gcc/config/aarch64/aarch64-cores.def > @@ -105,17 +105,17 @@ AARCH64_CORE("thunderx2t99", thunderx2t99, > thunderx2t99, V8_1A, (CRYPTO), thu > /* ARM ('A') cores. 
*/ > AARCH64_CORE("cortex-a55", cortexa55, cortexa53, V8_2A, (F16, RCPC, > DOTPROD), cortexa53, 0x41, 0xd05, -1) AARCH64_CORE("cortex-a75", > cortexa75, cortexa57, V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, > 0xd0a, -1) -AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, > (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1) - > AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16, > RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1) - > AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC, > DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1) -AARCH64_CORE("cortex- > a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), > neoversen1, 0x41, 0xd41, -1) -AARCH64_CORE("cortex-a78ae", cortexa78ae, > cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, > 0xd42, -1) -AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, > (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, > 0xd4b, -1) > +AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, > +DOTPROD), cortexa76, 0x41, 0xd0b, -1) AARCH64_CORE("cortex-a76ae", > +cortexa76ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, > +0x41, 0xd0e, -1) AARCH64_CORE("cortex-a77", cortexa77, cortexa57, > +V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1) > +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, > +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1) > +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16, > +RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1) > +AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC, > +DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1) > AARCH64_CORE("cortex-a65", cortexa65, cortexa53, V8_2A, (F16, RCPC, > DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex- > a65ae", cortexa65ae, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), > cortexa73, 0x41, 0xd43, -1) -AARCH64_CORE("cortex-x1", 
cortexx1, > cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, > 0xd44, -1) -AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, > (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1) > -AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, > PROFILE), neoversen1, 0x41, 0xd0c, -1) > +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC, > +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1) > +AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC, > +DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1) > +AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, > +PROFILE), cortexa76, 0x41, 0xd0c, -1) > AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, V8_2A, (F16, RCPC, > DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1) > AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, V8_2A, (F16, RCPC, > DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1) > > @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53", > cortexa73cortexa53, cortexa53, V8A, (CRC > /* ARM DynamIQ big.LITTLE configurations. */ > > AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53, > V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE > (0xd0a, 0xd05), -1) -AARCH64_CORE("cortex-a76.cortex-a55", > cortexa76cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), neoversen1, > 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1) > +AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, > +V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE > +(0xd0b, 0xd05), -1) > > /* Armv8-R Architecture Processors. 
*/ > AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, > 0xd15, -1) diff --git a/gcc/config/aarch64/aarch64.cc > b/gcc/config/aarch64/aarch64.cc index 42617ced73a..46710490a39 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -1867,7 +1867,7 @@ static const struct tune_params > thunderx3t110_tunings = > &thunderx3t110_prefetch_tune > }; > > -static const struct tune_params neoversen1_tunings = > +static const struct tune_params cortexa76_tunings = > { > &cortexa76_extra_costs, > &generic_addrcost_table, > @@ -1885,18 +1885,18 @@ static const struct tune_params > neoversen1_tunings = > }, /* memmov_cost. */ > 3, /* issue_rate */ > (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* > fusible_ops */ > - "32:16", /* function_align. */ > - "4", /* jump_align. */ > - "32:16", /* loop_align. */ > - 2, /* int_reassoc_width. */ > - 4, /* fp_reassoc_width. */ > - 1, /* fma_reassoc_width. */ > - 2, /* vec_reassoc_width. */ > - 2, /* min_div_recip_mul_sf. */ > - 2, /* min_div_recip_mul_df. */ > - 0, /* max_case_values. */ > - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ > + "32:16", /* function_align. */ > + "4", /* jump_align. */ > + "32:16", /* loop_align. */ > + 2, /* int_reassoc_width. */ > + 4, /* fp_reassoc_width. */ > + 1, /* fma_reassoc_width. */ > + 2, /* vec_reassoc_width. */ > + 2, /* min_div_recip_mul_sf. */ > + 2, /* min_div_recip_mul_df. */ > + 0, /* max_case_values. */ > + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. 
*/ > &generic_prefetch_tune > }; > > @@ -2293,6 +2293,135 @@ static const struct tune_params > neoverse512tvb_tunings = > &generic_prefetch_tune > }; > > +static const struct cpu_addrcost_table neoversen1_addrcost_table = { > + { > + 0, /* hi */ > + 0, /* si */ > + 0, /* di */ > + 1, /* ti */ > + }, > + 0, /* pre_modify */ > + 0, /* post_modify */ > + 1, /* post_modify_ld3_st3 */ > + 1, /* post_modify_ld4_st4 */ > + 0, /* register_offset */ > + 0, /* register_sextend */ > + 0, /* register_zextend */ > + 0 /* imm_offset */ > +}; > + > +static const struct cpu_regmove_cost neoversen1_regmove_cost = { > + 1, /* GP2GP */ > + /* Avoid the use of slow int<->fp moves for spilling by setting > + their cost higher than memmov_cost. */ > + 3, /* GP2FP */ > + 2, /* FP2GP */ > + 2 /* FP2FP */ > +}; > + > +static const advsimd_vec_cost neoversen1_advsimd_vector_cost = { > + 2, /* int_stmt_cost */ > + 2, /* fp_stmt_cost */ > + 0, /* ld2_st2_permute_cost */ > + 0, /* ld3_st3_permute_cost */ > + 0, /* ld4_st4_permute_cost */ > + 3, /* permute_cost */ > + 6, /* reduc_i8_cost */ > + 5, /* reduc_i16_cost */ > + 3, /* reduc_i32_cost */ > + 3, /* reduc_i64_cost */ > + 8, /* reduc_f16_cost */ > + 5, /* reduc_f32_cost */ > + 5, /* reduc_f64_cost */ > + 0, /* store_elt_extra_cost */ > + 2, /* vec_to_scalar_cost */ > + 2, /* scalar_to_vec_cost */ > + 4, /* align_load_cost */ > + 4, /* unalign_load_cost */ > + 1, /* unalign_store_cost */ > + 1 /* store_cost */ > +}; > + > +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info > += { > + 2, /* loads_stores_per_cycle */ > + 2, /* stores_per_cycle */ > + 2, /* general_ops_per_cycle */ > + 0, /* fp_simd_load_general_ops */ > + 1 /* fp_simd_store_general_ops */ > +}; > + > +static const aarch64_advsimd_vec_issue_info > +neoversen1_advsimd_issue_info = { > + { > + 2, /* loads_stores_per_cycle */ > + 2, /* stores_per_cycle */ > + 2, /* general_ops_per_cycle */ > + 0, /* fp_simd_load_general_ops */ > + 1 /* 
fp_simd_store_general_ops */ > + }, > + 3, /* ld2_st2_general_ops */ > + 5, /* ld3_st3_general_ops */ > + 11 /* ld4_st4_general_ops */ > +}; > + > +static const aarch64_vec_issue_info neoversen1_vec_issue_info = { > + &neoversen1_scalar_issue_info, /* scalar */ > + &neoversen1_advsimd_issue_info, /* advsimd */ > + nullptr /* sve */ > +}; > + > + > +static const struct cpu_vector_cost neoversen1_vector_cost = { > + 1, /* scalar_int_stmt_cost */ > + 1, /* scalar_fp_stmt_cost */ > + 4, /* scalar_load_cost */ > + 1, /* scalar_store_cost */ > + 1, /* cond_taken_branch_cost */ > + 1, /* cond_not_taken_branch_cost */ > + &neoversen1_advsimd_vector_cost, /* advsimd */ > + nullptr, /* sve */ > + &neoversen1_vec_issue_info /* issue_info */ }; > + > +static const struct tune_params neoversen1_tunings = { > + &neoversen1_extra_costs, > + &neoversen1_addrcost_table, > + &neoversen1_regmove_cost, > + &neoversen1_vector_cost, > + &generic_branch_cost, > + &generic_approx_modes, > + SVE_NOT_IMPLEMENTED, /* sve_width */ > + { 4, /* load_int. */ > + 2, /* store_int. */ > + 5, /* load_fp. */ > + 2, /* store_fp. */ > + 4, /* load_pred. */ > + 4 /* store_pred. */ > + }, /* memmov_cost. */ > + 4, /* issue_rate */ > + AARCH64_FUSE_AES_AESMC, /* fusible_ops */ > + "32:16", /* function_align. */ > + "4", /* jump_align. */ > + "32:16", /* loop_align. */ > + 2, /* int_reassoc_width. */ > + 4, /* fp_reassoc_width. */ > + 1, /* fma_reassoc_width. */ > + 2, /* vec_reassoc_width. */ > + 2, /* min_div_recip_mul_sf. */ > + 2, /* min_div_recip_mul_df. */ > + 0, /* max_case_values. */ > + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > + AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags. 
*/ > + &generic_prefetch_tune > +}; > + > static const advsimd_vec_cost neoversen2_advsimd_vector_cost = { > 2, /* int_stmt_cost */ > diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost- > tables.h > index e3848214728..fce6da6bbcc 100644 > --- a/gcc/config/arm/aarch-cost-tables.h > +++ b/gcc/config/arm/aarch-cost-tables.h > @@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs > = > } > }; > > +const struct cpu_cost_table neoversen1_extra_costs = { > + /* ALU */ > + { > + 0, /* arith. */ > + 0, /* logical. */ > + 0, /* shift. */ > + 0, /* shift_reg. */ > + COSTS_N_INSNS (1), /* arith_shift. */ > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > + 0, /* log_shift. */ > + COSTS_N_INSNS (1), /* log_shift_reg. */ > + 0, /* extend. */ > + COSTS_N_INSNS (1), /* extend_arith. */ > + COSTS_N_INSNS (1), /* bfi. */ > + 0, /* bfx. */ > + 0, /* clz. */ > + 0, /* rev. */ > + 0, /* non_exec. */ > + true /* non_exec_costs_exec. */ > + }, > + { > + /* MULT SImode */ > + { > + COSTS_N_INSNS (1), /* simple. */ > + COSTS_N_INSNS (2), /* flag_setting. */ > + COSTS_N_INSNS (1), /* extend. */ > + COSTS_N_INSNS (1), /* add. */ > + COSTS_N_INSNS (1), /* extend_add. */ > + COSTS_N_INSNS (11) /* idiv. */ > + }, > + /* MULT DImode */ > + { > + COSTS_N_INSNS (3), /* simple. */ > + 0, /* flag_setting (N/A). */ > + COSTS_N_INSNS (1), /* extend. */ > + COSTS_N_INSNS (3), /* add. */ > + COSTS_N_INSNS (1), /* extend_add. */ > + COSTS_N_INSNS (19) /* idiv. */ > + } > + }, > + /* LD/ST */ > + { > + COSTS_N_INSNS (3), /* load. */ > + COSTS_N_INSNS (3), /* load_sign_extend. */ > + COSTS_N_INSNS (3), /* ldrd. */ > + COSTS_N_INSNS (2), /* ldm_1st. */ > + 1, /* ldm_regs_per_insn_1st. */ > + 2, /* ldm_regs_per_insn_subsequent. */ > + COSTS_N_INSNS (4), /* loadf. */ > + COSTS_N_INSNS (4), /* loadd. */ > + COSTS_N_INSNS (3), /* load_unaligned. */ > + 0, /* store. */ > + 0, /* strd. */ > + 0, /* stm_1st. */ > + 1, /* stm_regs_per_insn_1st. 
*/ > + 2, /* stm_regs_per_insn_subsequent. */ > + 0, /* storef. */ > + 0, /* stored. */ > + COSTS_N_INSNS (1), /* store_unaligned. */ > + COSTS_N_INSNS (1), /* loadv. */ > + COSTS_N_INSNS (1) /* storev. */ > + }, > + { > + /* FP SFmode */ > + { > + COSTS_N_INSNS (9), /* div. */ > + COSTS_N_INSNS (2), /* mult. */ > + COSTS_N_INSNS (3), /* mult_addsub. */ > + COSTS_N_INSNS (3), /* fma. */ > + COSTS_N_INSNS (1), /* addsub. */ > + COSTS_N_INSNS (1), /* fpconst. */ > + 0, /* neg. */ > + 0, /* compare. */ > + COSTS_N_INSNS (1), /* widen. */ > + COSTS_N_INSNS (1), /* narrow. */ > + COSTS_N_INSNS (1), /* toint. */ > + COSTS_N_INSNS (1), /* fromint. */ > + COSTS_N_INSNS (1) /* roundint. */ > + }, > + /* FP DFmode */ > + { > + COSTS_N_INSNS (14), /* div. */ > + COSTS_N_INSNS (2), /* mult. */ > + COSTS_N_INSNS (3), /* mult_addsub. */ > + COSTS_N_INSNS (3), /* fma. */ > + COSTS_N_INSNS (1), /* addsub. */ > + COSTS_N_INSNS (1), /* fpconst. */ > + 0, /* neg. */ > + 0, /* compare. */ > + COSTS_N_INSNS (1), /* widen. */ > + COSTS_N_INSNS (1), /* narrow. */ > + COSTS_N_INSNS (1), /* toint. */ > + COSTS_N_INSNS (1), /* fromint. */ > + COSTS_N_INSNS (1) /* roundint. */ > + } > + }, > + /* Vector */ > + { > + COSTS_N_INSNS (1), /* alu. */ > + COSTS_N_INSNS (4), /* mult. */ > + COSTS_N_INSNS (1), /* movi. */ > + COSTS_N_INSNS (1), /* dup. */ > + COSTS_N_INSNS (1) /* extract. */ > + } > +}; > + > const struct cpu_cost_table exynosm1_extra_costs = { > /* ALU */ > -- > 2.39.2 (Apple Git-143) > > > > > -- > Evandro Menezes ◊ evan...@yahoo.com ◊ Austin, TX > Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus