Wilco Dijkstra <wilco.dijks...@arm.com> writes: > (follow-on based on review comments on > https://gcc.gnu.org/pipermail/gcc-patches/2024-January/641913.html) > > > Remove the tune AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS since it is only > used by an old core and doesn't properly support -Os. SPECINT_2017 > shows that removing it has no performance difference, while codesize > is reduced by 0.07%. > > Passes regress, OK for commit? > > gcc/ChangeLog: > * config/aarch64/aarch64.cc (aarch64_mode_valid_for_sched_fusion_p): > Remove check for AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS. > (aarch64_advsimd_ldp_stp_p): Likewise. > (aarch64_stp_sequence_cost): Likewise. > (aarch64_expand_cpymem): Likewise. > (aarch64_expand_setmem): Likewise. > * config/aarch64/aarch64-ldp-fusion.cc (ldp_operand_mode_ok_p): > Likewise. > * config/aarch64/aarch64-ldpstp.md: Likewise. > * config/aarch64/aarch64-tuning-flags.def: Remove NO_LDP_STP_QREGS. > * config/aarch64/tuning_models/emag.h: Likewise. > * config/aarch64/tuning_models/xgene1.h: Likewise. > > gcc/testsuite/ChangeLog: > * gcc.target/aarch64/ldp_stp_q_disable.c: Remove test.
OK for GCC 15 (but only once GCC 14 has branched) if no-one objects in the meantime. Thanks, Richard > --- > > diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc > b/gcc/config/aarch64/aarch64-ldp-fusion.cc > index > 22ed95eb743c9ee44e745560b207d389c8fca03b..de6685f75a2650d9a7d39fe6781ec57214092eb1 > 100644 > --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc > +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc > @@ -315,17 +315,9 @@ any_post_modify_p (rtx x) > static bool > ldp_operand_mode_ok_p (machine_mode mode) > { > - const bool allow_qregs > - = !(aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS); > - > if (!aarch64_ldpstp_operand_mode_p (mode)) > return false; > > - const auto size = GET_MODE_SIZE (mode).to_constant (); > - if (size == 16 && !allow_qregs) > - return false; > - > // We don't pair up TImode accesses before RA because TImode is > // special in that it can be allocated to a pair of GPRs or a single > // FPR, and the RA is best placed to make that decision. > diff --git a/gcc/config/aarch64/aarch64-ldpstp.md > b/gcc/config/aarch64/aarch64-ldpstp.md > index > b7c0bf05cd18c971955d667bae91d7c3dc3f512e..7890a8cc32b24f8e1bc29cb722b10e511e7881ab > 100644 > --- a/gcc/config/aarch64/aarch64-ldpstp.md > +++ b/gcc/config/aarch64/aarch64-ldpstp.md > @@ -96,9 +96,7 @@ (define_peephole2 > (set (match_operand:VQ2 2 "register_operand" "") > (match_operand:VQ2 3 "memory_operand" ""))] > "TARGET_FLOAT > - && aarch64_operands_ok_for_ldpstp (operands, true) > - && (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" > + && aarch64_operands_ok_for_ldpstp (operands, true)" > [(const_int 0)] > { > aarch64_finish_ldpstp_peephole (operands, true); > @@ -111,9 +109,7 @@ (define_peephole2 > (set (match_operand:VQ2 2 "memory_operand" "") > (match_operand:VQ2 3 "register_operand" ""))] > "TARGET_FLOAT > - && aarch64_operands_ok_for_ldpstp (operands, false) > - && (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" > + && aarch64_operands_ok_for_ldpstp (operands, false)" > [(const_int 0)] > { > aarch64_finish_ldpstp_peephole (operands, false); > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def > b/gcc/config/aarch64/aarch64-tuning-flags.def > index > d917da720b22ed6aaf360dc4ebbe8efc4a3185f2..d5bcaebce770f0b217aac783063d39135f754c77 > 100644 > --- a/gcc/config/aarch64/aarch64-tuning-flags.def > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def > @@ -36,9 +36,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", > RENAME_FMA_REGS) > are not considered cheap. */ > AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) > > -/* Disallow load/store pair instructions on Q-registers. */ > -AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS) > - > AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) > > AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS) > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 433c160cba22374f6b7a3445c0202789927abd25..d7e8379b2eb90eccb8608a15cc8d11cc2187a9e7 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -10335,9 +10335,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode > mode) > || mode == SDmode || mode == DDmode > || (aarch64_vector_mode_supported_p (mode) > && (known_eq (GET_MODE_SIZE (mode), 8) > - || (known_eq (GET_MODE_SIZE (mode), 16) > - && (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0))); > + || known_eq (GET_MODE_SIZE (mode), 16))); > } > > /* Return true if REGNO is a virtual pointer register, or an eliminable > @@ -16448,10 +16446,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt > kind, > return false; > } > > - if (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) > - return false; > - > return is_gimple_assign (stmt_info->stmt); > } > > @@ -17099,9 +17093,6 @@ aarch64_stp_sequence_cost (unsigned int count, > vect_cost_for_stmt kind, > /* Count 1 insn per vector if we can't form STP Q pairs. */ > if (aarch64_sve_mode_p (TYPE_MODE (vectype))) > return count * 2; > - if (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) > - return count * 2; > > if (stmt_info) > { > @@ -26454,11 +26445,9 @@ aarch64_expand_cpymem (rtx *operands, bool > is_memmove) > return aarch64_expand_cpymem_mops (operands, is_memmove); > > unsigned HOST_WIDE_INT size = UINTVAL (operands[2]); > - bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS); > > /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. > */ > - unsigned max_copy_size = use_ldpq ? 256 : 128; > + unsigned max_copy_size = TARGET_SIMD ? 256 : 128; > unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold > : aarch64_mops_memcpy_size_threshold; > > @@ -26470,16 +26459,13 @@ aarch64_expand_cpymem (rtx *operands, bool > is_memmove) > if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold)) > return aarch64_expand_cpymem_mops (operands, is_memmove); > > - unsigned copy_max = 32; > - > - /* Default to 32-byte LDP/STP on large copies, however small copies, no > SIMD > - support or slow LDP/STP fall back to 16-byte chunks. > + /* Default to 32-byte LDP/STP on large copies, however small copies or > + no SIMD support fall back to 16-byte chunks. > > ??? Although it would be possible to use LDP/STP Qn in streaming mode > (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear > whether that would improve performance. */ > - if (size <= 24 || !use_ldpq) > - copy_max = 16; > + unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32; > > base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); > dst = adjust_automodify_address (dst, VOIDmode, base, 0); > @@ -26612,13 +26598,8 @@ aarch64_expand_setmem (rtx *operands) > src = expand_vector_broadcast (V16QImode, val); > src = force_reg (V16QImode, src); > > - /* Set maximum amount to write in one go. We allow 32-byte chunks based > - on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */ > - unsigned set_max = 32; > - > - if (len <= 24 || (aarch64_tune_params.extra_tuning_flags > - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) > - set_max = 16; > + /* Set maximum number of bytes to write per instruction. */ > + unsigned set_max = (len <= 24) ? 16 : 32; > > int offset = 0; > while (len > 0) > diff --git a/gcc/config/aarch64/tuning_models/emag.h > b/gcc/config/aarch64/tuning_models/emag.h > index > cbaf8853ec4f040cab6dc82e62150d99c30ad1df..b6a9c9e2eb17e9ead91e70088a5ece0c74672e70 > 100644 > --- a/gcc/config/aarch64/tuning_models/emag.h > +++ b/gcc/config/aarch64/tuning_models/emag.h > @@ -51,7 +51,7 @@ static const struct tune_params emag_tunings = > 2, /* min_div_recip_mul_df. */ > 17, /* max_case_values. */ > tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ > - (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ > + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ > &xgene1_prefetch_tune, > AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ > AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ > diff --git a/gcc/config/aarch64/tuning_models/xgene1.h > b/gcc/config/aarch64/tuning_models/xgene1.h > index > 3301f0252609f3798b78bac18a480af8c3f0f45d..432793eba9cd66c6d3423a9443fa0534438c21b6 > 100644 > --- a/gcc/config/aarch64/tuning_models/xgene1.h > +++ b/gcc/config/aarch64/tuning_models/xgene1.h > @@ -136,7 +136,7 @@ static const struct tune_params xgene1_tunings = > 2, /* min_div_recip_mul_df. */ > 17, /* max_case_values. */ > tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ > - (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ > + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ > &xgene1_prefetch_tune, > AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ > AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c > deleted file mode 100644 > index > 38c1870c47c69175e4b641532333bf7108351476..0000000000000000000000000000000000000000 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c > +++ /dev/null > @@ -1,26 +0,0 @@ > -/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */ > - > -typedef float float32x4_t __attribute__ ((__vector_size__ ((16)))); > - > -float32x4_t arr[4][4]; > - > -void > -foo (float32x4_t x, float32x4_t y) > -{ > - arr[0][1] = x; > - arr[1][0] = y; > - arr[2][0] = x; > - arr[1][1] = y; > - arr[0][2] = x; > - arr[0][3] = y; > - arr[1][2] = x; > - arr[2][1] = y; > - arr[3][0] = x; > - arr[3][1] = y; > - arr[2][2] = x; > - arr[1][3] = y; > - arr[2][3] = x; > - arr[3][2] = y; > -} > - > -/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */