[PATCH] AArch64: Switch off early scheduling

Wilco Dijkstra Thu, 31 Oct 2024 09:45:55 -0700

The early scheduler takes up ~33% of the total build time; however, it doesn't
provide a meaningful performance gain.  This is partly because modern OoO cores
need far less scheduling, and partly because the scheduler tends to create many
unnecessary spills by increasing register pressure.  Building applications
56% faster is far more useful than a ~0.1% improvement on SPEC, so switch off
early scheduling on AArch64.  Codesize reduces by ~0.2%.


The combine_and_move pass runs if the scheduler is disabled and aggressively
combines moves.  The movsf/df patterns allow all FP immediates since they
rely on a split pattern; however, splits do not happen this late.  To fix this,
use a more accurate check that blocks creation of literal loads during
combine_and_move.  Fix various tests that depend on scheduling by explicitly
adding -fschedule-insns.

Passes bootstrap and regression testing.  OK for commit?

gcc/ChangeLog:
        * common/config/aarch64/aarch64-common.cc
        (aarch_option_optimization_table): Switch off fschedule_insns.
        * config/aarch64/aarch64.md (movhf_aarch64): Use aarch64_valid_fp_move.
        (movsf_aarch64): Likewise.
        (movdf_aarch64): Likewise.
        * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function.
        * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Declare.

gcc/testsuite/ChangeLog:
        * gcc.target/aarch64/ldp_aligned.c: Fix test.
        * gcc.target/aarch64/ldp_always.c: Likewise.
        * gcc.target/aarch64/ldp_stp_10.c: Add -fschedule-insns.
        * gcc.target/aarch64/ldp_stp_12.c: Likewise.
        * gcc.target/aarch64/ldp_stp_13.c: Remove test.
        * gcc.target/aarch64/ldp_stp_21.c: Add -fschedule-insns.
        * gcc.target/aarch64/ldp_stp_8.c: Likewise.
        * gcc.target/aarch64/ldp_vec_v2sf.c: Likewise.
        * gcc.target/aarch64/ldp_vec_v2si.c: Likewise.
        * gcc.target/aarch64/test_frame_16.c: Fix test.
        * gcc.target/aarch64/sve/vcond_12.c: Add -fschedule-insns.
        * gcc.target/aarch64/sve/acle/general/ldff1_3.c: Likewise.

---

diff --git a/gcc/common/config/aarch64/aarch64-common.cc b/gcc/common/config/aarch64/aarch64-common.cc
index 2bfc597e333b6018970a9ee6e370a66b6d0960ef..845747e31e821c2f3970fd39ea70f046eddbe920 100644
--- a/gcc/common/config/aarch64/aarch64-common.cc
+++ b/gcc/common/config/aarch64/aarch64-common.cc
@@ -54,6 +54,8 @@ static const struct default_options aarch_option_optimization_table[] =
     { OPT_LEVELS_ALL, OPT_fomit_frame_pointer, NULL, 0 },
     /* Enable -fsched-pressure by default when optimizing.  */
     { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
+    /* Disable early scheduling due to high compile-time overheads.  */
+    { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
     /* Enable redundant extension instructions removal at -O2 and higher.  */
     { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
     { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 250c5b96a21ea1c969a0e77e420525eec90e4de4..b30329d7f85f5b962dca43cf12ca938898425874 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -758,6 +758,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode);
 opt_machine_mode aarch64_vq_mode (scalar_mode);
 opt_machine_mode aarch64_full_sve_mode (scalar_mode);
 bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
+bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
                                            HOST_WIDE_INT);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2647293f7cf020378dacc37b7bfbccc856573e44..965ec18412a6486e6ac4ff2e4a7d742bf61e5d75 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -11223,6 +11223,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
   return aarch64_simd_valid_mov_imm (v_op);
 }
 
+/* Return TRUE if DST and SRC with mode MODE is a valid fp move.  */
+bool
+aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
+{
+  if (!TARGET_FLOAT)
+    return false;
+
+  if (aarch64_reg_or_fp_zero (src, mode))
+    return true;
+
+  if (!register_operand (dst, mode))
+    return false;
+
+  if (MEM_P (src))
+    return true;
+
+  if (!DECIMAL_FLOAT_MODE_P (mode))
+    {
+      if (aarch64_can_const_movi_rtx_p (src, mode)
+         || aarch64_float_const_representable_p (src)
+         || aarch64_float_const_zero_rtx_p (src))
+       return true;
+
+      /* This requires a split which is only allowed before regalloc.  */
+      if (aarch64_float_const_rtx_p (src))
+       return can_create_pseudo_p () && !ira_in_progress;
+    }
+
+  return can_create_pseudo_p ();
+}
 
 /* Return the fixed registers used for condition codes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 20956fc49d8232763b127629ded17037ad7d7960..5d3fa9628952031f52474291e160b957d774b011 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1644,8 +1644,7 @@ (define_expand "mov<mode>"
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:HFBF 0 "nonimmediate_operand")
        (match_operand:HFBF 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%0.4h, #0
      [ w        , ?rY ; f_mcr       , fp16  ] fmov\t%h0, %w1
@@ -1668,8 +1667,7 @@ (define_insn "*mov<mode>_aarch64"
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:SFD 0 "nonimmediate_operand")
        (match_operand:SFD 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%0.2s, #0
      [ w        , ?rY ; f_mcr       , *     ] fmov\t%s0, %w1
@@ -1689,8 +1687,7 @@ (define_insn "*mov<mode>_aarch64"
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:DFD 0 "nonimmediate_operand")
        (match_operand:DFD 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%d0, #0
      [ w        , ?rY ; f_mcr       , *     ] fmov\t%d0, %x1
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
index 75495d71df28235b2bb2dc634c3e5121d398bac2..8ec2b0392b80d4c0d8b47a512ba291e3bade3be3 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
@@ -14,25 +14,11 @@ TYPE ldp_aligned_##TYPE(char* ptr){ \
     return a_0 + a_1; \
 }
 
-#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
-TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
-    TYPE a_0, a_1, a_2, a_3, a_4; \
-    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
-    a_0 = arr[100]; \
-    a_1 = arr[101]; \
-    a_2 = arr[102]; \
-    a_3 = arr[103]; \
-    a_4 = arr[110]; \
-    return a_0 + a_1 + a_2 + a_3 + a_4; \
-}
-
 LDP_TEST_ALIGNED(int32_t);
 LDP_TEST_ALIGNED(int64_t);
 LDP_TEST_ALIGNED(v4si);
-LDP_TEST_ADJUST_ALIGNED(int32_t);
-LDP_TEST_ADJUST_ALIGNED(int64_t);
 
-/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 3 } } */
-/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
+/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 1 } } */
 /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 1 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_always.c b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
index 9cada57db8947e8ace4ad0bdacc14c80ee0fe9b5..5ffb98a886ecb659bb5c7a5e7ef013cacd14ffb7 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_always.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
@@ -24,43 +24,14 @@ TYPE ldp_unaligned_##TYPE(char* ptr){ \
     return a_0 + a_1; \
 }
 
-#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
-TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
-    TYPE a_0, a_1, a_2, a_3, a_4; \
-    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
-    a_0 = arr[100]; \
-    a_1 = arr[101]; \
-    a_2 = arr[102]; \
-    a_3 = arr[103]; \
-    a_4 = arr[110]; \
-    return a_0 + a_1 + a_2 + a_3 + a_4; \
-}
-
-#define LDP_TEST_ADJUST_UNALIGNED(TYPE) \
-TYPE ldp_unaligned_adjust_##TYPE(char* ptr){ \
-    TYPE a_0, a_1, a_2, a_3, a_4; \
-    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
-    TYPE *a = arr+1; \
-    a_0 = a[100]; \
-    a_1 = a[101]; \
-    a_2 = a[102]; \
-    a_3 = a[103]; \
-    a_4 = a[110]; \
-    return a_0 + a_1 + a_2 + a_3 + a_4; \
-}
-
 LDP_TEST_ALIGNED(int32_t);
 LDP_TEST_ALIGNED(int64_t);
 LDP_TEST_ALIGNED(v4si);
 LDP_TEST_UNALIGNED(int32_t);
 LDP_TEST_UNALIGNED(int64_t);
 LDP_TEST_UNALIGNED(v4si);
-LDP_TEST_ADJUST_ALIGNED(int32_t);
-LDP_TEST_ADJUST_ALIGNED(int64_t);
-LDP_TEST_ADJUST_UNALIGNED(int32_t);
-LDP_TEST_ADJUST_UNALIGNED(int64_t);
 
-/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 6 } } */
-/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 2 } } */
 /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 2 } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
index 31f392901d2ca9e9e31cb20735fdf86eb040ee88..ac4828af76175388aa0112458476b02064c4e8fc 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 int
 load (int *arr)
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
index 718e82b53f0ccfd09a19afa26ebdb88654359e33..495e199270a60f797a8de21bbe6b8a771f927f23 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 void
 store_offset (int *array, int x, int y)
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
deleted file mode 100644
index 9cc3942f153773e8ffe9bcaf07f6b32dc0d5f95e..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-O2 -mabi=ilp32" } */
-
-long long
-load_long (long long int *arr)
-{
-  return arr[400] << 1 + arr[401] << 1 + arr[403] << 1 + arr[404] << 1;
-}
-
-/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, " 2 } } */
-
-int
-load (int *arr)
-{
-  return arr[527] << 1 + arr[400] << 1 + arr[401] << 1 + arr[528] << 1;
-}
-
-/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]+, " 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
index d54c322ce860688de734721718a9c57185d4be63..ac7bc164840ddff765fe599c525aa1d62f217401 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 #pragma GCC target "+nosimd+fp"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
index b25678323b85046d4a320d534be24aee429274b8..2adf151491b76fbdae8382852feefd810ab3611a 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 typedef float __attribute__ ((vector_size (8))) fvec;
 typedef int __attribute__ ((vector_size (8))) ivec;
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
index fbdae1c6cff1aef40db644361381ce511f0be64a..7a87fe7dd0a4715230733e25acd791dcd082f360 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 typedef float __attribute__((vector_size(8))) vec;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
index 7714cd6cd9e8fa7dc1febf484d6726d44c246408..068f53e28ce5c5d1e60105a7c2b4001fa96f5153 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 typedef int __attribute__((vector_size(8))) vec;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
index 41ad0bcea00f287757dd510b21915decafbc48c1..14eacce09c0585ec2132cd5dd185626e051ca588 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -fschedule-insns" } */
 
 #include <arm_sve.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
index de650bf39e27b5cdb0f06d04b5d7948b3cc94a54..59dcc0abecf57455bb43ba47a65a2bfd3eae1929 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fschedule-insns" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
index 28f3826adadd5eaa6486659e4d6b6d7c5960b9d2..0f67458f71856afc54741960e0ac045ad5447395 100644
--- a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
@@ -17,7 +17,7 @@ double vararg_outgoing (int x1, ...)
   double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = x1 * 6;
   __builtin_va_list vl;
   __builtin_va_start (vl, x1);
-  outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
+  outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1), REP8 (1));
   __builtin_va_end (vl);
   return a1 + a2 + a3 + a4 + a5 + a6;
 }

[PATCH] AArch64: Switch off early scheduling

Reply via email to