> On 26 Jul 2024, at 11:21, Tamar Christina <tamar.christ...@arm.com> wrote:
>
> Hi All,
>
> This updates the costs for Neoverse N2 to reflect the updated
> Software Optimization Guide.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
Ok.
Thanks,
Kyrill
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/tuning_models/neoversen2.h: Update costs.
>
> ---
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h
> b/gcc/config/aarch64/tuning_models/neoversen2.h
> index
> be9a48ac3adc097f967c217fe09dcac194d7d14f..3430eb9c06819e00ab38966bb960bd6525ff2b5c
> 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -57,7 +57,7 @@ static const advsimd_vec_cost
> neoversen2_advsimd_vector_cost =
> 2, /* ld2_st2_permute_cost */
> 2, /* ld3_st3_permute_cost */
> 3, /* ld4_st4_permute_cost */
> - 3, /* permute_cost */
> + 2, /* permute_cost */
> 4, /* reduc_i8_cost */
> 4, /* reduc_i16_cost */
> 2, /* reduc_i32_cost */
> @@ -86,28 +86,28 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
> {
> 2, /* int_stmt_cost */
> 2, /* fp_stmt_cost */
> - 3, /* ld2_st2_permute_cost */
> - 4, /* ld3_st3_permute_cost */
> - 4, /* ld4_st4_permute_cost */
> - 3, /* permute_cost */
> + 2, /* ld2_st2_permute_cost */
> + 3, /* ld3_st3_permute_cost */
> + 3, /* ld4_st4_permute_cost */
> + 2, /* permute_cost */
> /* Theoretically, a reduction involving 15 scalar ADDs could
> complete in ~5 cycles and would have a cost of 15. [SU]ADDV
> - completes in 11 cycles, so give it a cost of 15 + 6. */
> - 21, /* reduc_i8_cost */
> - /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
> - 13, /* reduc_i16_cost */
> - /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
> - 9, /* reduc_i32_cost */
> - /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
> - 2, /* reduc_i64_cost */
> + completes in 9 cycles, so give it a cost of 15 + 4. */
> + 19, /* reduc_i8_cost */
> + /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5. */
> + 12, /* reduc_i16_cost */
> + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4. */
> + 7, /* reduc_i32_cost */
> + /* Likewise for 1 scalar ADD (~1 cycle) vs. 4: 1 + 3. */
> + 4, /* reduc_i64_cost */
> /* Theoretically, a reduction involving 7 scalar FADDs could
> - complete in ~8 cycles and would have a cost of 14. FADDV
> - completes in 6 cycles, so give it a cost of 14 - 2. */
> - 12, /* reduc_f16_cost */
> - /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
> - 6, /* reduc_f32_cost */
> - /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
> - 2, /* reduc_f64_cost */
> + complete in ~8 cycles and would have a cost of 7. FADDV
> + completes in 6 cycles, so give it a cost of 7 + -2. */
> + 5, /* reduc_f16_cost */
> + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 3 + 0. */
> + 3, /* reduc_f32_cost */
> + /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 1 + 0. */
> + 1, /* reduc_f64_cost */
> 2, /* store_elt_extra_cost */
> /* This value is just inherited from the Cortex-A57 table. */
> 8, /* vec_to_scalar_cost */
> @@ -127,7 +127,7 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
> /* A strided Advanced SIMD x64 load would take two parallel FP loads
> (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
> is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
> - (cost 8) and a vec_construct (cost 2). Add a full vector operation
> + (cost 8) and a vec_construct (cost 4). Add a full vector operation
> (cost 2) to that, to avoid the difference being lost in rounding.
>
> There is no easy comparison between a strided Advanced SIMD x32 load
> @@ -165,14 +165,14 @@ static const aarch64_sve_vec_issue_info
> neoversen2_sve_issue_info =
> {
> {
> {
> - 3, /* loads_per_cycle */
> + 3, /* loads_stores_per_cycle */
> 2, /* stores_per_cycle */
> 2, /* general_ops_per_cycle */
> 0, /* fp_simd_load_general_ops */
> 1 /* fp_simd_store_general_ops */
> },
> 2, /* ld2_st2_general_ops */
> - 3, /* ld3_st3_general_ops */
> + 2, /* ld3_st3_general_ops */
> 3 /* ld4_st4_general_ops */
> },
> 2, /* pred_ops_per_cycle */
> @@ -190,7 +190,7 @@ static const aarch64_vec_issue_info
> neoversen2_vec_issue_info =
> &neoversen2_sve_issue_info
> };
>
> -/* Neoverse N2 costs for vector insn classes. */
> +/* Neoversen2 costs for vector insn classes. */
> static const struct cpu_vector_cost neoversen2_vector_cost =
> {
> 1, /* scalar_int_stmt_cost */
> @@ -220,7 +220,7 @@ static const struct tune_params neoversen2_tunings =
> 6, /* load_pred. */
> 1 /* store_pred. */
> }, /* memmov_cost. */
> - 3, /* issue_rate */
> + 5, /* issue_rate */
> (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
> "32:16", /* function_align. */
> "4", /* jump_align. */
> @@ -243,4 +243,4 @@ static const struct tune_params neoversen2_tunings =
> AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
> };
>
> -#endif /* GCC_AARCH64_H_NEOVERSEN2. */
> +#endif /* GCC_AARCH64_H_NEOVERSEN2. */
> \ No newline at end of file
>
> --
> <rb18669.patch>
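
As a cross-check on the new SVE reduction entries (this note is not part of the patch itself): the comments in the hunk all follow the same pattern, cost = scalar ops + (reduction-instruction latency - scalar-sequence latency). A minimal standalone C sketch, with an illustrative helper name, that reproduces the updated table values under that assumption:

#include <assert.h>

/* Hypothetical helper mirroring the reasoning in the patch comments:
   start from the scalar cost (one unit per scalar ADD/FADD) and add the
   cycle difference between the reduction instruction and the scalar
   sequence.  */
static int
sve_reduc_cost (int scalar_ops, int scalar_cycles, int reduc_cycles)
{
  return scalar_ops + (reduc_cycles - scalar_cycles);
}

int
main (void)
{
  assert (sve_reduc_cost (15, 5, 9) == 19); /* reduc_i8_cost */
  assert (sve_reduc_cost (7, 3, 8) == 12);  /* reduc_i16_cost */
  assert (sve_reduc_cost (3, 2, 6) == 7);   /* reduc_i32_cost */
  assert (sve_reduc_cost (1, 1, 4) == 4);   /* reduc_i64_cost */
  assert (sve_reduc_cost (7, 8, 6) == 5);   /* reduc_f16_cost */
  assert (sve_reduc_cost (3, 4, 4) == 3);   /* reduc_f32_cost */
  assert (sve_reduc_cost (1, 2, 2) == 1);   /* reduc_f64_cost */
  return 0;
}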