Philipp Tomsich <philipp.toms...@vrull.eu> writes: > This adds support and a basic tuning model for the Ampere Computing > "Ampere-1" CPU. > > The Ampere-1 implements the ARMv8.6 architecture in A64 mode and is > modelled as a 4-wide issue (as with all modern micro-architectures, > the chosen issue rate is a compromise between the maximum dispatch > rate and the maximum rate of uops issued to the scheduler). > > This adds the -mcpu=ampere1 command-line option and the relevant cost > information/tuning tables for the Ampere-1. > > gcc/ChangeLog: > > * config/aarch64/aarch64-cores.def (AARCH64_CORE): New Ampere-1 > core. > * config/aarch64/aarch64-tune.md: Regenerate. > * config/aarch64/aarch64-cost-tables.h: Add extra costs for > Ampere-1. > * config/aarch64/aarch64.c: Add tuning structures for Ampere-1. > * doc/invoke.texi: Add documentation for Ampere-1 core.
OK, thanks. Richard > (cherry picked from commit 67b0d47e20e655c0dd53a76ea88aab60fafb2059) > > --- > This is a backport from master and only affects the AArch64 backend. > A similar change is already backported to GCC-11. > > OK for GCC-10? > > gcc/config/aarch64/aarch64-cores.def | 3 +- > gcc/config/aarch64/aarch64-cost-tables.h | 103 +++++++++++++++++++++++ > gcc/config/aarch64/aarch64-tune.md | 2 +- > gcc/config/aarch64/aarch64.c | 59 +++++++++++++ > gcc/doc/invoke.texi | 2 +- > 5 files changed, 166 insertions(+), 3 deletions(-) > > diff --git a/gcc/config/aarch64/aarch64-cores.def > b/gcc/config/aarch64/aarch64-cores.def > index fc60e2ae1ac..3c858160cef 100644 > --- a/gcc/config/aarch64/aarch64-cores.def > +++ b/gcc/config/aarch64/aarch64-cores.def > @@ -68,7 +68,8 @@ AARCH64_CORE("octeontx83", octeontxt83, thunderx, 8A, > AARCH64_FL_FOR_ARCH > AARCH64_CORE("thunderxt81", thunderxt81, thunderx, 8A, > AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, 0x43, > 0x0a2, -1) > AARCH64_CORE("thunderxt83", thunderxt83, thunderx, 8A, > AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, 0x43, > 0x0a3, -1) > > -/* Ampere Computing cores. */ > +/* Ampere Computing ('\xC0') cores. */ > +AARCH64_CORE("ampere1", ampere1, cortexa57, 8_6A, AARCH64_FL_FOR_ARCH8_6, > ampere1, 0xC0, 0xac3, -1) > /* Do not swap around "emag" and "xgene1", > this order is required to handle variant correctly. */ > AARCH64_CORE("emag", emag, xgene1, 8A, AARCH64_FL_FOR_ARCH8 > | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, emag, 0x50, 0x000, 3) > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h > b/gcc/config/aarch64/aarch64-cost-tables.h > index c6805717f6e..9f9627b864e 100644 > --- a/gcc/config/aarch64/aarch64-cost-tables.h > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > @@ -644,4 +644,107 @@ const struct cpu_cost_table a64fx_extra_costs = > } > }; > > +const struct cpu_cost_table ampere1_extra_costs = > +{ > + /* ALU */ > + { > + 0, /* arith. */ > + 0, /* logical. 
*/ > + 0, /* shift. */ > + COSTS_N_INSNS (1), /* shift_reg. */ > + 0, /* arith_shift. */ > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > + 0, /* log_shift. */ > + COSTS_N_INSNS (1), /* log_shift_reg. */ > + 0, /* extend. */ > + COSTS_N_INSNS (1), /* extend_arith. */ > + 0, /* bfi. */ > + 0, /* bfx. */ > + 0, /* clz. */ > + 0, /* rev. */ > + 0, /* non_exec. */ > + true /* non_exec_costs_exec. */ > + }, > + { > + /* MULT SImode */ > + { > + COSTS_N_INSNS (3), /* simple. */ > + COSTS_N_INSNS (3), /* flag_setting. */ > + COSTS_N_INSNS (3), /* extend. */ > + COSTS_N_INSNS (4), /* add. */ > + COSTS_N_INSNS (4), /* extend_add. */ > + COSTS_N_INSNS (18) /* idiv. */ > + }, > + /* MULT DImode */ > + { > + COSTS_N_INSNS (3), /* simple. */ > + 0, /* flag_setting (N/A). */ > + COSTS_N_INSNS (3), /* extend. */ > + COSTS_N_INSNS (4), /* add. */ > + COSTS_N_INSNS (4), /* extend_add. */ > + COSTS_N_INSNS (34) /* idiv. */ > + } > + }, > + /* LD/ST */ > + { > + COSTS_N_INSNS (4), /* load. */ > + COSTS_N_INSNS (4), /* load_sign_extend. */ > + 0, /* ldrd (n/a). */ > + 0, /* ldm_1st. */ > + 0, /* ldm_regs_per_insn_1st. */ > + 0, /* ldm_regs_per_insn_subsequent. */ > + COSTS_N_INSNS (5), /* loadf. */ > + COSTS_N_INSNS (5), /* loadd. */ > + COSTS_N_INSNS (5), /* load_unaligned. */ > + 0, /* store. */ > + 0, /* strd. */ > + 0, /* stm_1st. */ > + 0, /* stm_regs_per_insn_1st. */ > + 0, /* stm_regs_per_insn_subsequent. */ > + COSTS_N_INSNS (2), /* storef. */ > + COSTS_N_INSNS (2), /* stored. */ > + COSTS_N_INSNS (2), /* store_unaligned. */ > + COSTS_N_INSNS (3), /* loadv. */ > + COSTS_N_INSNS (3) /* storev. */ > + }, > + { > + /* FP SFmode */ > + { > + COSTS_N_INSNS (25), /* div. */ > + COSTS_N_INSNS (4), /* mult. */ > + COSTS_N_INSNS (4), /* mult_addsub. */ > + COSTS_N_INSNS (4), /* fma. */ > + COSTS_N_INSNS (4), /* addsub. */ > + COSTS_N_INSNS (2), /* fpconst. */ > + COSTS_N_INSNS (4), /* neg. */ > + COSTS_N_INSNS (4), /* compare. */ > + COSTS_N_INSNS (4), /* widen. 
*/ > + COSTS_N_INSNS (4), /* narrow. */ > + COSTS_N_INSNS (4), /* toint. */ > + COSTS_N_INSNS (4), /* fromint. */ > + COSTS_N_INSNS (4) /* roundint. */ > + }, > + /* FP DFmode */ > + { > + COSTS_N_INSNS (34), /* div. */ > + COSTS_N_INSNS (5), /* mult. */ > + COSTS_N_INSNS (5), /* mult_addsub. */ > + COSTS_N_INSNS (5), /* fma. */ > + COSTS_N_INSNS (5), /* addsub. */ > + COSTS_N_INSNS (2), /* fpconst. */ > + COSTS_N_INSNS (5), /* neg. */ > + COSTS_N_INSNS (5), /* compare. */ > + COSTS_N_INSNS (5), /* widen. */ > + COSTS_N_INSNS (5), /* narrow. */ > + COSTS_N_INSNS (6), /* toint. */ > + COSTS_N_INSNS (6), /* fromint. */ > + COSTS_N_INSNS (5) /* roundint. */ > + } > + }, > + /* Vector */ > + { > + COSTS_N_INSNS (3), /* alu. */ > + } > +}; > + > #endif > diff --git a/gcc/config/aarch64/aarch64-tune.md > b/gcc/config/aarch64/aarch64-tune.md > index aa68d67bdf4..737f1afa189 100644 > --- a/gcc/config/aarch64/aarch64-tune.md > +++ b/gcc/config/aarch64/aarch64-tune.md > @@ -1,5 +1,5 @@ > ;; -*- buffer-read-only: t -*- > ;; Generated automatically by gentune.sh from aarch64-cores.def > (define_attr "tune" > - > "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" > + > 
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 1e08f4caa11..d1913d7af76 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -781,6 +781,24 @@ static const struct cpu_vector_cost a64fx_vector_cost = > 1 /* cond_not_taken_branch_cost */ > }; > > +static const cpu_vector_cost ampere1_advsimd_vector_cost = > +{ > + 1, /* scalar_int_stmt_cost */ > + 5, /* scalar_fp_stmt_cost */ > + 4, /* scalar_load_cost */ > + 2, /* scalar_store_cost */ > + 4, /* vec_int_stmt_cost */ > + 5, /* vec_fp_stmt_cost */ > + 5, /* vec_permute_cost */ > + 10, /* vec_to_scalar_cost */ > + 5, /* scalar_to_vec_cost */ > + 6, /* vec_align_load_cost */ > + 6, /* vec_unalign_load_cost */ > + 6, /* vec_unalign_store_cost */ > + 4, /* vec_store_cost */ > + 1, /* cond_taken_branch_cost */ > + 1 /* cond_not_taken_branch_cost */ > +}; > > /* Generic costs for branch instructions. 
*/ > static const struct cpu_branch_cost generic_branch_cost = > @@ -924,6 +942,17 @@ static const cpu_prefetch_tune a64fx_prefetch_tune = > -1 /* default_opt_level */ > }; > > +static const cpu_prefetch_tune ampere1_prefetch_tune = > +{ > + 0, /* num_slots */ > + 64, /* l1_cache_size */ > + 64, /* l1_cache_line_size */ > + 2048, /* l2_cache_size */ > + true, /* prefetch_dynamic_strides */ > + -1, /* minimum_stride */ > + -1 /* default_opt_level */ > +}; > + > static const struct tune_params generic_tunings = > { > &cortexa57_extra_costs, > @@ -1384,6 +1413,36 @@ static const struct tune_params neoversen1_tunings = > &generic_prefetch_tune > }; > > +static const struct tune_params ampere1_tunings = > +{ > + &ampere1_extra_costs, > + &generic_addrcost_table, > + &generic_regmove_cost, > + &ampere1_advsimd_vector_cost, > + &generic_branch_cost, > + &generic_approx_modes, > + SVE_NOT_IMPLEMENTED, /* sve_width */ > + 4, /* memmov_cost */ > + 4, /* issue_rate */ > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | > + AARCH64_FUSE_CMP_BRANCH), > + /* fusible_ops */ > + "32", /* function_align. */ > + "4", /* jump_align. */ > + "32:16", /* loop_align. */ > + 2, /* int_reassoc_width. */ > + 4, /* fp_reassoc_width. */ > + 2, /* vec_reassoc_width. */ > + 2, /* min_div_recip_mul_sf. */ > + 2, /* min_div_recip_mul_df. */ > + 0, /* max_case_values. */ > + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ > + &ampere1_prefetch_tune > +}; > + > static const struct tune_params neoversev1_tunings = > { > &cortexa57_extra_costs, > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index 81e14773f42..edb550c143d 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -17011,7 +17011,7 @@ performance of the code. 
Permissible values for this > option are: > @samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t110}, @samp{zeus}, > @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, > @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53}, > -@samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55} > +@samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55}, @samp{ampere1}, > @samp{native}. > > The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},