[gcc r17-835] AArch64: fix the SVE->SIMD lowering optimization [PR125148]

Tamar Christina via Gcc-cvs Wed, 27 May 2026 02:50:59 -0700

https://gcc.gnu.org/g:a6ee91793b9f4d28ccd3fcc6f607f646d305a39e


commit r17-835-ga6ee91793b9f4d28ccd3fcc6f607f646d305a39e
Author: Tamar Christina <[email protected]>
Date:   Wed May 27 10:50:05 2026 +0100

    AArch64: fix the SVE->SIMD lowering optimization [PR125148]
    
    The optimization added in g:210d06502f22964c7214586c54f8eb54a6965bfd has an
    implementation bug which makes it generate bogus code.
    
    The optimization was support to convert SVE loads with a known predicate 
into
    Adv. SIMD loads without the predicate.
    
    The current implementation is done at expansion time where the predicate is
    still clearly available.
    
    It does this by rewriting the loads to an Adv. SIMD load and then taking a
    paradoxical subreg of the result into an SVE vector.
    
    i.e. (subreg:VNx16QI (reg:QI 111) 0)  for a byte load with a VL1 predicate.
    
    The issue is that the SVE loads were UNSPEC before and they didn't get 
optimized
    by passes like forwprop and cse.  Adv. SIMD loads are.
    
    as such in cases where you have such a pattern:
    
    char[] p = {1,2,3,3};
    load (p, VL1)
    
    we used to generate
    
            mov     w0, 1
            strb    w0, [x19]
            ptrue   p7.b, vl1
            ld1b    z30.b, p7/z, [x19]
    
    which was dumb, but valid and the above optimization now gets the load
    eliminated and the constants folded.  However, in particular for scalars,
    AArch64 has an optimization that's been a long for ages in which scalar FPR
    constants are created using vector broadcasting operations.  It assumes 
scalars
    are accessed as scalars (as in, in the mode that created them).
    
    So the above gets optimized to
    
            movi    v30.8b, 0x1
    
    which is invalid.  The original load requires the inactive elements to be 
zero,
    where-as by using the paradoxical subreg it's relying on the implicit (as 
in,
    not modelled in RTL) assumption that the load zeros the top bits, but 
doesn't
    keep in mind that the load can be optimized away.
    
    This patch fixes it by creating a full SVE vector of 0s and writing only the
    values we want to set using an INSR. (i.e. using VL2 of bytes writes a 
short).
    
    It then provides patterns to optimize this:
    
    1. if it's still following a load, just emit the load.
    2. if it's not, then optimize it to a zero'ing operation. so e.g. HI mode
       issues an fmov h0, h0 and so clears the top bits to zero.
    
    I choose this representation because even without the above operations it is
    semantically valid and will generate correct code.
    
    The alternative would be to delay this optimization to e.g. combine however 
we
    have two problems there:
    
    1. It's quite late, so the above constant cases for instance don't get 
optimized
       and we keep the pointless store and loads.
    2. Our RTX costs don't model predicates.  and so it may not accept the
       combination since the replacement is more expensive.
    
    So I chose to keep the optimization early, but just replace the paradoxical
    subreg with a zero-extend.
    
    gcc/ChangeLog:
    
            PR target/125148
            * config/aarch64/aarch64-sve.md
            (*aarch64_vec_shl_insert_into_zero_<mode>,
            *aarch64_vec_shl_insert_into_zero_vnx16qi,
            *aarch64_vec_shl_insert_from_load_<mode>): New.
            * config/aarch64/aarch64.cc (aarch64_emit_load_store_through_mode):
            Replace paradoxical subreg with zero-extend.
    
    gcc/testsuite/ChangeLog:
    
            PR target/125148
            * gcc.target/aarch64/sve/highway_run.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md                  | 45 +++++++++++++++
 gcc/config/aarch64/aarch64.cc                      | 64 ++++++++++++++--------
 gcc/testsuite/gcc.target/aarch64/sve/highway_run.c | 55 +++++++++++++++++++
 3 files changed, 141 insertions(+), 23 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 019630eb8d21..e7d98c3754f1 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3004,6 +3004,51 @@
   [(set_attr "sve_type" "sve_int_general")]
 )
 
+;; Shift an SVE vector left and insert a scalar into element 0 of a zero
+;; register.
+(define_insn "*aarch64_vec_shl_insert_into_zero_<mode>"
+  [(set (match_operand:SVE_FULL_HSD 0 "register_operand")
+       (unspec:SVE_FULL_HSD
+         [(match_operand:SVE_FULL_HSD 1 "aarch64_simd_imm_zero")
+          (match_operand:<VEL> 2 "register_operand")]
+         UNSPEC_INSR))]
+  "TARGET_SVE"
+  {@ [ cons: =0 , 1  , 2 ]
+     [ w        , Dz , w ] fmov\t%<Vetype>0, %<Vetype>2
+  }
+  [(set_attr "type" "neon_move")]
+)
+
+;; Shift an SVE vector left and insert a scalar into element 0 of a zero
+;; register for bytes.
+(define_insn "*aarch64_vec_shl_insert_into_zero_vnx16qi"
+  [(set (match_operand:VNx16QI 0 "register_operand")
+       (unspec:VNx16QI
+         [(match_operand:VNx16QI 1 "aarch64_simd_imm_zero")
+          (match_operand:QI 2 "register_operand")]
+         UNSPEC_INSR))]
+  "TARGET_SVE"
+  {@ [ cons: =0 , 1  , 2 ; attrs: length ]
+     [ w        , Dz , w ; 8              ] fmov\t%h0, %h2\;and\t%0.h, %0.h, 
#0xff
+  }
+  [(set_attr "type" "neon_move")]
+)
+
+;; Shift an SVE vector left and insert a scalar into element 0 from a memory
+;; load.
+(define_insn "*aarch64_vec_shl_insert_from_load_<mode>"
+  [(set (match_operand:SVE_FULL 0 "register_operand")
+       (unspec:SVE_FULL
+         [(match_operand:SVE_FULL 1 "aarch64_simd_imm_zero")
+          (match_operand:<VEL> 2 "memory_operand")]
+         UNSPEC_INSR))]
+  "TARGET_SVE"
+  {@ [ cons: =0 , 1  , 2 ]
+     [ w        , Dz , m ] ldr\t%<Vetype>0, %2
+  }
+  [(set_attr "type" "neon_move")]
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Linear series
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 7b1dcbdbcfa2..abd6eb5fd1ee 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6887,29 +6887,6 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx 
decl_rtl,
   return gen_rtx_MEM (mode, force_reg (Pmode, addr));
 }
 
-/* Emit a load/store from a subreg of SRC to a subreg of DEST.
-   The subregs have mode NEW_MODE. Use only for reg<->mem moves.  */
-void
-aarch64_emit_load_store_through_mode (rtx dest, rtx src, machine_mode new_mode)
-{
-  gcc_assert ((MEM_P (dest) && register_operand (src, VOIDmode))
-             || (MEM_P (src) && register_operand (dest, VOIDmode)));
-  auto mode = GET_MODE (dest);
-  auto int_mode = aarch64_sve_int_mode (mode);
-  if (MEM_P (src))
-    {
-      rtx tmp = force_reg (new_mode, adjust_address (src, new_mode, 0));
-      tmp = force_lowpart_subreg (int_mode, tmp, new_mode);
-      emit_move_insn (dest, force_lowpart_subreg (mode, tmp, int_mode));
-    }
-  else
-    {
-      src = force_lowpart_subreg (int_mode, src, mode);
-      emit_move_insn (adjust_address (dest, new_mode, 0),
-                     force_lowpart_subreg (new_mode, src, int_mode));
-    }
-}
-
 /* PRED is a predicate that is known to contain PTRUE.
    For 128-bit VLS loads/stores, emit LDR/STR.
    Else, emit an SVE predicated move from SRC to DEST.  */
@@ -26312,6 +26289,47 @@ aarch64_sve_expand_vector_init_subvector (rtx target, 
rtx vals)
   return;
 }
 
+/* Emit a load/store from a subreg of SRC to a subreg of DEST.
+   The subregs have mode NEW_MODE. Use only for reg<->mem moves.  */
+void
+aarch64_emit_load_store_through_mode (rtx dest, rtx src, machine_mode new_mode)
+{
+  gcc_assert ((MEM_P (dest) && register_operand (src, VOIDmode))
+             || (MEM_P (src) && register_operand (dest, VOIDmode)));
+  auto mode = GET_MODE (dest);
+  auto int_mode = aarch64_sve_int_mode (mode);
+  rtx tmp_reg;
+  if (MEM_P (src))
+    {
+      rtx tmp = force_reg (new_mode, adjust_address (src, new_mode, 0));
+      if (!VECTOR_MODE_P (new_mode))
+       {
+         machine_mode full_mode = int_mode;
+         auto vmode = aarch64_classify_vector_mode (int_mode);
+         /* Partial vectors have to go through a full mode insert since we
+            don't support inserting an partial vectors.  */
+         if (GET_MODE_INNER (int_mode) != new_mode || (vmode & VEC_PARTIAL))
+           full_mode
+             = aarch64_full_sve_mode (as_a <scalar_mode> (new_mode)).require 
();
+
+         /* Create an SVE register with the top bits explicitly zero'd.  */
+         tmp_reg = force_reg (full_mode, CONST0_RTX (full_mode));
+         emit_insr (tmp_reg, tmp);
+         if (full_mode != int_mode)
+           tmp_reg = force_lowpart_subreg (int_mode, tmp_reg, full_mode);
+       }
+      else
+       tmp_reg = force_lowpart_subreg (int_mode, tmp, new_mode);
+      emit_move_insn (dest, force_lowpart_subreg (mode, tmp_reg, int_mode));
+    }
+  else
+    {
+      src = force_lowpart_subreg (int_mode, src, mode);
+      emit_move_insn (adjust_address (dest, new_mode, 0),
+                     force_lowpart_subreg (new_mode, src, int_mode));
+    }
+}
+
 /* Check whether VALUE is a vector constant in which every element
    is either a power of 2 or a negated power of 2.  If so, return
    a constant vector of log2s, and flip CODE between PLUS and MINUS
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/highway_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve/highway_run.c
new file mode 100644
index 000000000000..b73fd51c63be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/highway_run.c
@@ -0,0 +1,55 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+extern void abort (void) __attribute__ ((noreturn));
+
+volatile int cond = 1;
+
+int __attribute__ ((noipa))
+a (void)
+{
+  return cond;
+}
+
+#define TEST_LOAD(TYPE, NAME, BITS)                                    \
+  int __attribute__ ((noipa))                                          \
+  test_##NAME (void)                                                   \
+  {                                                                    \
+    TYPE *g = __builtin_malloc (sizeof (TYPE));                                
\
+    int c = 0;                                                         \
+    if (!g)                                                            \
+      abort ();                                                                
\
+    g[0] = 0;                                                          \
+    if (a ())                                                          \
+      {                                                                        
\
+       g[0] = 1;                                                       \
+       c = 2;                                                          \
+      }                                                                        
\
+    svint##BITS##_t d                                          \
+      = svld1_s##BITS (svptrue_pat_b##BITS (SV_VL1), g);               \
+    svbool_t e = svcmpgt_s##BITS (svptrue_b##BITS (), d,               \
+                                 svdup_n_s##BITS (0));         \
+    int f = svptest_any (svptrue_pat_b##BITS (SV_VL1), e);             \
+    if (f && c != 2)                                                   \
+      abort ();                                                                
\
+    return f;                                                          \
+  }
+
+TEST_LOAD (signed char, byte, 8)
+TEST_LOAD (short, short, 16)
+TEST_LOAD (int, int, 32)
+TEST_LOAD (long, long, 64)
+
+int
+main (void)
+{
+  if (!test_byte ()
+      || !test_short ()
+      || !test_int ()
+      || !test_long ())
+    abort ();
+  return 0;
+}

[gcc r17-835] AArch64: fix the SVE->SIMD lowering optimization [PR125148]

Reply via email to