On Thu, Oct 31, 2024 at 10:07 AM Richard Sandiford <richard.sandif...@arm.com> wrote: > > Wilco Dijkstra <wilco.dijks...@arm.com> writes: > > The early scheduler takes up ~33% of the total build time, however it > > doesn't > > provide a meaningful performance gain. This is partly because modern OoO > > cores > > need far less scheduling, partly because the scheduler tends to create many > > unnecessary spills by increasing register pressure. Building applications > > 56% faster is far more useful than ~0.1% improvement on SPEC, so switch off > > early scheduling on AArch64. Codesize reduces by ~0.2%. > > > > The combine_and_move pass runs if the scheduler is disabled and aggressively > > combines moves. The movsf/df patterns allow all FP immediates since they > > rely on a split pattern, however splits do not happen this late. To fix > > this, > > use a more accurate check that blocks creation of literal loads during > > combine_and_move. Fix various tests that depend on scheduling by explicitly > > adding -fschedule-insns. > > > > Passes bootstrap & regress, OK for commit? > > I'm in favour of this. Obviously the numbers are what count, but > also from first principles: > > - I can't remember the last time a scheduling model was added to the port.
We have one internally for oryon-1, but I have not yet had time to benchmark with it vs. without it; I suspect it won't help enough to even think about upstreaming it. I think the last model added was tsv110.md in 2020. > > - We've (consciously) never added scheduling types for SVE. > > - It doesn't make logical sense to schedule for Neoverse V3 (say) > as though it were a Cortex A57. > > So at this point, it seems better for scheduling to be opt-in rather > than opt-out. (That is, we can switch to a tune-based default if > anyone does add a new scheduling model in future.) > > Let's see what others think. > > Please split the md changes out into a separate pre-patch though. > > What do you think about disabling late scheduling as well? EBB scheduling (after register allocation) can actually help because it moves things before branches; even though branch prediction on modern hardware is decent, the HW sometimes gets confused. Thanks, Andrew Pinski > > Thanks, > Richard > > > gcc/ChangeLog: > > * common/config/aarch64/aarch64-common.cc: Switch off > > fschedule_insns. > > * config/aarch64/aarch64.md (movhf_aarch64): Use > > aarch64_valid_fp_move. > > (movsf_aarch64): Likewise. > > (movdf_aarch64): Likewise. > > * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function. > > * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise. > > > > gcc/testsuite/ChangeLog: > > * testsuite/gcc.target/aarch64/ldp_aligned.c: Fix test. > > * testsuite/gcc.target/aarch64/ldp_always.c: Likewise. > > * testsuite/gcc.target/aarch64/ldp_stp_10.c: Add -fschedule-insns. > > * testsuite/gcc.target/aarch64/ldp_stp_12.c: Likewise. > > * testsuite/gcc.target/aarch64/ldp_stp_13.c: Remove test. > > * testsuite/gcc.target/aarch64/ldp_stp_21.c: Add -fschedule-insns. > > * testsuite/gcc.target/aarch64/ldp_stp_8.c: Likewise. > > * testsuite/gcc.target/aarch64/ldp_vec_v2sf.c: Likewise. > > * testsuite/gcc.target/aarch64/ldp_vec_v2si.c: Likewise. 
> > * testsuite/gcc.target/aarch64/test_frame_16.c: Fix test. > > * testsuite/gcc.target/aarch64/sve/vcond_12.c: Add -fschedule-insns. > > * testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c: Likewise. > > > > --- > > > > diff --git a/gcc/common/config/aarch64/aarch64-common.cc > > b/gcc/common/config/aarch64/aarch64-common.cc > > index > > 2bfc597e333b6018970a9ee6e370a66b6d0960ef..845747e31e821c2f3970fd39ea70f046eddbe920 > > 100644 > > --- a/gcc/common/config/aarch64/aarch64-common.cc > > +++ b/gcc/common/config/aarch64/aarch64-common.cc > > @@ -54,6 +54,8 @@ static const struct default_options > > aarch_option_optimization_table[] = > > { OPT_LEVELS_ALL, OPT_fomit_frame_pointer, NULL, 0 }, > > /* Enable -fsched-pressure by default when optimizing. */ > > { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 }, > > + /* Disable early scheduling due to high compile-time overheads. */ > > + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, > > /* Enable redundant extension instructions removal at -O2 and higher. 
> > */ > > { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 }, > > { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL }, > > diff --git a/gcc/config/aarch64/aarch64-protos.h > > b/gcc/config/aarch64/aarch64-protos.h > > index > > 250c5b96a21ea1c969a0e77e420525eec90e4de4..b30329d7f85f5b962dca43cf12ca938898425874 > > 100644 > > --- a/gcc/config/aarch64/aarch64-protos.h > > +++ b/gcc/config/aarch64/aarch64-protos.h > > @@ -758,6 +758,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode); > > opt_machine_mode aarch64_vq_mode (scalar_mode); > > opt_machine_mode aarch64_full_sve_mode (scalar_mode); > > bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode); > > +bool aarch64_valid_fp_move (rtx, rtx, machine_mode); > > bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT); > > bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, > > HOST_WIDE_INT); > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index > > 2647293f7cf020378dacc37b7bfbccc856573e44..965ec18412a6486e6ac4ff2e4a7d742bf61e5d75 > > 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -11223,6 +11223,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode > > mode) > > return aarch64_simd_valid_mov_imm (v_op); > > } > > > > +/* Return TRUE if DST and SRC with mode MODE is a valid fp move. */ > > +bool > > +aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode) > > +{ > > + if (!TARGET_FLOAT) > > + return false; > > + > > + if (aarch64_reg_or_fp_zero (src, mode)) > > + return true; > > + > > + if (!register_operand (dst, mode)) > > + return false; > > + > > + if (MEM_P (src)) > > + return true; > > + > > + if (!DECIMAL_FLOAT_MODE_P (mode)) > > + { > > + if (aarch64_can_const_movi_rtx_p (src, mode) > > + || aarch64_float_const_representable_p (src) > > + || aarch64_float_const_zero_rtx_p (src)) > > + return true; > > + > > + /* This requires a split which is only allowed before regalloc. 
*/ > > + if (aarch64_float_const_rtx_p (src)) > > + return can_create_pseudo_p () && !ira_in_progress; > > + } > > + > > + return can_create_pseudo_p (); > > +} > > > > /* Return the fixed registers used for condition codes. */ > > > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > > index > > 20956fc49d8232763b127629ded17037ad7d7960..5d3fa9628952031f52474291e160b957d774b011 > > 100644 > > --- a/gcc/config/aarch64/aarch64.md > > +++ b/gcc/config/aarch64/aarch64.md > > @@ -1644,8 +1644,7 @@ (define_expand "mov<mode>" > > (define_insn "*mov<mode>_aarch64" > > [(set (match_operand:HFBF 0 "nonimmediate_operand") > > (match_operand:HFBF 1 "general_operand"))] > > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > > {@ [ cons: =0 , 1 ; attrs: type , arch ] > > [ w , Y ; neon_move , simd ] movi\t%0.4h, #0 > > [ w , ?rY ; f_mcr , fp16 ] fmov\t%h0, %w1 > > @@ -1668,8 +1667,7 @@ (define_insn "*mov<mode>_aarch64" > > (define_insn "*mov<mode>_aarch64" > > [(set (match_operand:SFD 0 "nonimmediate_operand") > > (match_operand:SFD 1 "general_operand"))] > > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > > {@ [ cons: =0 , 1 ; attrs: type , arch ] > > [ w , Y ; neon_move , simd ] movi\t%0.2s, #0 > > [ w , ?rY ; f_mcr , * ] fmov\t%s0, %w1 > > @@ -1689,8 +1687,7 @@ (define_insn "*mov<mode>_aarch64" > > (define_insn "*mov<mode>_aarch64" > > [(set (match_operand:DFD 0 "nonimmediate_operand") > > (match_operand:DFD 1 "general_operand"))] > > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > > {@ [ cons: =0 , 1 ; attrs: type , arch 
] > > [ w , Y ; neon_move , simd ] movi\t%d0, #0 > > [ w , ?rY ; f_mcr , * ] fmov\t%d0, %x1 > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > > index > > 75495d71df28235b2bb2dc634c3e5121d398bac2..8ec2b0392b80d4c0d8b47a512ba291e3bade3be3 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > > @@ -14,25 +14,11 @@ TYPE ldp_aligned_##TYPE(char* ptr){ \ > > return a_0 + a_1; \ > > } > > > > -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \ > > -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \ > > - TYPE a_0, a_1, a_2, a_3, a_4; \ > > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > > - a_0 = arr[100]; \ > > - a_1 = arr[101]; \ > > - a_2 = arr[102]; \ > > - a_3 = arr[103]; \ > > - a_4 = arr[110]; \ > > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > > -} > > - > > LDP_TEST_ALIGNED(int32_t); > > LDP_TEST_ALIGNED(int64_t); > > LDP_TEST_ALIGNED(v4si); > > -LDP_TEST_ADJUST_ALIGNED(int32_t); > > -LDP_TEST_ADJUST_ALIGNED(int64_t); > > > > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 3 } } */ > > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */ > > +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */ > > +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 1 } } */ > > /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 1 } } */ > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_always.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_always.c > > index > > 9cada57db8947e8ace4ad0bdacc14c80ee0fe9b5..5ffb98a886ecb659bb5c7a5e7ef013cacd14ffb7 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_always.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_always.c > > @@ -24,43 +24,14 @@ TYPE ldp_unaligned_##TYPE(char* ptr){ \ > > return a_0 + a_1; \ > > } > > > > -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \ > > -TYPE ldp_aligned_adjust_##TYPE(char* 
ptr){ \ > > - TYPE a_0, a_1, a_2, a_3, a_4; \ > > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > > - a_0 = arr[100]; \ > > - a_1 = arr[101]; \ > > - a_2 = arr[102]; \ > > - a_3 = arr[103]; \ > > - a_4 = arr[110]; \ > > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > > -} > > - > > -#define LDP_TEST_ADJUST_UNALIGNED(TYPE) \ > > -TYPE ldp_unaligned_adjust_##TYPE(char* ptr){ \ > > - TYPE a_0, a_1, a_2, a_3, a_4; \ > > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > > - TYPE *a = arr+1; \ > > - a_0 = a[100]; \ > > - a_1 = a[101]; \ > > - a_2 = a[102]; \ > > - a_3 = a[103]; \ > > - a_4 = a[110]; \ > > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > > -} > > - > > LDP_TEST_ALIGNED(int32_t); > > LDP_TEST_ALIGNED(int64_t); > > LDP_TEST_ALIGNED(v4si); > > LDP_TEST_UNALIGNED(int32_t); > > LDP_TEST_UNALIGNED(int64_t); > > LDP_TEST_UNALIGNED(v4si); > > -LDP_TEST_ADJUST_ALIGNED(int32_t); > > -LDP_TEST_ADJUST_ALIGNED(int64_t); > > -LDP_TEST_ADJUST_UNALIGNED(int32_t); > > -LDP_TEST_ADJUST_UNALIGNED(int64_t); > > > > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 6 } } */ > > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 6 } } */ > > +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 2 } } */ > > +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 2 } } */ > > /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 2 } } */ > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > > index > > 31f392901d2ca9e9e31cb20735fdf86eb040ee88..ac4828af76175388aa0112458476b02064c4e8fc > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > > @@ -1,4 +1,4 @@ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > int > > load (int *arr) > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > > 
b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > > index > > 718e82b53f0ccfd09a19afa26ebdb88654359e33..495e199270a60f797a8de21bbe6b8a771f927f23 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > > @@ -1,4 +1,4 @@ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > void > > store_offset (int *array, int x, int y) > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > > deleted file mode 100644 > > index > > 9cc3942f153773e8ffe9bcaf07f6b32dc0d5f95e..0000000000000000000000000000000000000000 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > > +++ /dev/null > > @@ -1,18 +0,0 @@ > > -/* { dg-do compile } */ > > -/* { dg-options "-O2 -mabi=ilp32" } */ > > - > > -long long > > -load_long (long long int *arr) > > -{ > > - return arr[400] << 1 + arr[401] << 1 + arr[403] << 1 + arr[404] << 1; > > -} > > - > > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, " 2 } } */ > > - > > -int > > -load (int *arr) > > -{ > > - return arr[527] << 1 + arr[400] << 1 + arr[401] << 1 + arr[528] << 1; > > -} > > - > > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]+, " 2 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > > index > > d54c322ce860688de734721718a9c57185d4be63..ac7bc164840ddff765fe599c525aa1d62f217401 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > > @@ -1,4 +1,4 @@ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > #pragma GCC target "+nosimd+fp" > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > > index > > b25678323b85046d4a320d534be24aee429274b8..2adf151491b76fbdae8382852feefd810ab3611a > > 100644 > > --- 
a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > > @@ -1,4 +1,4 @@ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > typedef float __attribute__ ((vector_size (8))) fvec; > > typedef int __attribute__ ((vector_size (8))) ivec; > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > > index > > fbdae1c6cff1aef40db644361381ce511f0be64a..7a87fe7dd0a4715230733e25acd791dcd082f360 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > > @@ -1,5 +1,5 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > typedef float __attribute__((vector_size(8))) vec; > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > > b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > > index > > 7714cd6cd9e8fa7dc1febf484d6726d44c246408..068f53e28ce5c5d1e60105a7c2b4001fa96f5153 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > > @@ -1,5 +1,5 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > typedef int __attribute__((vector_size(8))) vec; > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > > index > > 41ad0bcea00f287757dd510b21915decafbc48c1..14eacce09c0585ec2132cd5dd185626e051ca588 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > > @@ -1,5 +1,5 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2" } */ > > +/* { dg-options "-O2 -fschedule-insns" } */ > > > > #include <arm_sve.h> > > > > diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > > b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > > index > > de650bf39e27b5cdb0f06d04b5d7948b3cc94a54..59dcc0abecf57455bb43ba47a65a2bfd3eae1929 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > > @@ -1,5 +1,5 @@ > > /* { dg-do compile } */ > > -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ > > +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fschedule-insns" } */ > > > > #include <stdint.h> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > > b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > > index > > 28f3826adadd5eaa6486659e4d6b6d7c5960b9d2..0f67458f71856afc54741960e0ac045ad5447395 > > 100644 > > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > > @@ -17,7 +17,7 @@ double vararg_outgoing (int x1, ...) > > double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = > > x1 * 6; > > __builtin_va_list vl; > > __builtin_va_start (vl, x1); > > - outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1)); > > + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1), REP8 (1)); > > __builtin_va_end (vl); > > return a1 + a2 + a3 + a4 + a5 + a6; > > }