SVE loads and stores whose governing predicate selects only the bottom 8, 16,
32, 64, or 128 bits of the vector can be folded to ASIMD LDR/STR, avoiding the
predicate entirely.
For example,
svuint8_t foo (uint8_t *x) {
  return svld1 (svwhilelt_b8 (0, 16), x);
}
was previously compiled to:
foo:
        ptrue   p3.b, vl16
        ld1b    z0.b, p3/z, [x0]
        ret

and is now compiled to:
foo:
        ldr     q0, [x0]
        ret
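
The store counterpart is folded in the same way.  As a hand-written sketch
(the function name is chosen for illustration; the exact output is checked by
the new test ldst_ptrue_pat_128_to_neon.c):
void bar (uint8_t *x, svuint8_t data) {
  svst1 (svwhilelt_b8 (0, 16), x, data);
}
should now compile to a single
        str     q0, [x0]
instead of a ptrue/st1b pair.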

The optimization is applied during the expand pass and was implemented
by making the following changes to maskload<mode><vpred> and
maskstore<mode><vpred>:
- The existing define_insns were renamed, and new define_expands for maskload
  and maskstore were added whose operand predicate for the SVE governing
  predicate accepts both register operands and constant-vector operands.
- If the SVE governing predicate is a constant vector with one of the patterns
  described above, an ASIMD load/store is emitted instead of the SVE
  load/store (illustrated below).
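
To illustrate the width check: a load of 16-bit elements under a VL4 predicate
covers 4 * 16 = 64 bits, so on little-endian targets it can go through the
64-bit container mode.  For example (a hand-written sketch; the adjusted
whilelt_5.c test covers the equivalent case):
svint16_t baz (int16_t *x) {
  return svld1 (svwhilelt_b16 (0, 4), x);
}
should now compile to:
baz:
        ldr     d0, [x0]
        ret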

The patch implements the optimization for LD1 and ST1, for 8-bit, 16-bit,
32-bit, 64-bit, and 128-bit moves, for all full SVE data vector modes.
Note that VNx8HFmode and VNx8BFmode with a VL2 pattern were excluded, because
there are no move patterns for V2HFmode and V2BFmode (yet).
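
For example (again a hand-written sketch, mirroring the comment in the new
test), a float16 load under a VL2 predicate such as
svfloat16_t qux (float16_t *x) {
  return svld1 (svwhilelt_b16 (0, 2), x);
}
would need a V2HFmode move and therefore keeps the ptrue/ld1h sequence for
now.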

Follow-up patches for LD2/3/4 and ST2/3/4 and potentially partial SVE vector
modes are planned.

The patch was bootstrapped and tested on aarch64-linux-gnu, with no regressions.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>

gcc/
        PR target/117978
        * config/aarch64/aarch64-protos.h (aarch64_simd_container_mode)
        (aarch64_sve_full_data_mode_p, aarch64_count_pred_pat_128)
        (aarch64_emit_load_store_through_mode): Declare.
        * config/aarch64/aarch64-sve.md
        (maskload<mode><vpred>): New define_expand folding maskloads with
        certain predicate patterns to ASIMD loads.
        (*aarch64_maskload<mode><vpred>): Renamed from maskload<mode><vpred>.
        (maskstore<mode><vpred>): New define_expand folding maskstores with
        certain predicate patterns to ASIMD stores.
        (*aarch64_maskstore<mode><vpred>): Renamed from maskstore<mode><vpred>.
        * config/aarch64/aarch64.cc
        (aarch64_sve_full_data_mode_p): New function returning true if a given
        mode is a full SVE data vector mode.
        (aarch64_emit_load_store_through_mode): New function emitting a
        load/store through subregs of a given mode.
        (aarch64_emit_sve_pred_move): Refactor to use
        aarch64_emit_load_store_through_mode.
        (aarch64_v8_mode): New function returning an 8-bit mode.
        (aarch64_v16_mode): New function returning a 16-bit mode.
        (aarch64_v32_mode): New function returning a 32-bit mode.
        (aarch64_simd_container_mode): Make public and extend to find
        8-bit, 16-bit, and 32-bit container modes.
        (aarch64_count_pred_pat_128): New function to find SVE predicates
        with VL1, VL2, VL4, VL8, or VL16 patterns.
        * config/aarch64/iterators.md (elem_bits): Extend to cover partial
        SVE vector modes.
        * config/aarch64/predicates.md (aarch64_sve_reg_or_const_pred): New
        predicate matching register operands or constant-vector operands.

gcc/testsuite/
        PR target/117978
        * gcc.target/aarch64/sve/acle/general/whilelt_5.c: Adjust expected
        outcome.
        * gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c: New test.
        * gcc.target/aarch64/sve/while_7.c: Adjust expected outcome.
        * gcc.target/aarch64/sve/while_9.c: Adjust expected outcome.
---
 gcc/config/aarch64/aarch64-protos.h           |   4 +
 gcc/config/aarch64/aarch64-sve.md             |  62 ++++++++-
 gcc/config/aarch64/aarch64.cc                 | 128 +++++++++++++++---
 gcc/config/aarch64/iterators.md               |  19 ++-
 gcc/config/aarch64/predicates.md              |   4 +
 .../aarch64/sve/acle/general/whilelt_5.c      |  24 +++-
 .../aarch64/sve/ldst_ptrue_pat_128_to_neon.c  |  81 +++++++++++
 .../gcc.target/aarch64/sve/while_7.c          |   4 +-
 .../gcc.target/aarch64/sve/while_9.c          |   2 +-
 9 files changed, 296 insertions(+), 32 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1ca86c9d175..a03f091fe3a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -857,6 +857,7 @@ enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
 bool aarch64_advsimd_struct_mode_p (machine_mode mode);
 opt_machine_mode aarch64_v64_mode (scalar_mode);
 opt_machine_mode aarch64_v128_mode (scalar_mode);
+machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 opt_machine_mode aarch64_full_sve_mode (scalar_mode);
 bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
@@ -903,8 +904,10 @@ opt_machine_mode aarch64_advsimd_vector_array_mode (machine_mode,
                                                    unsigned HOST_WIDE_INT);
 opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64);
 bool aarch64_sve_mode_p (machine_mode);
+bool aarch64_sve_full_data_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
 bool aarch64_sve_cnt_immediate_p (rtx);
+int aarch64_count_pred_pat_128 (rtx, machine_mode);
 bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
 bool aarch64_sve_rdvl_immediate_p (rtx);
 bool aarch64_sve_addvl_addpl_immediate_p (rtx);
@@ -1026,6 +1029,7 @@ rtx aarch64_ptrue_reg (machine_mode, unsigned int);
 rtx aarch64_ptrue_reg (machine_mode, machine_mode);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
+void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
 void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
 bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index d4af3706294..d9392e3611a 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1286,7 +1286,36 @@
 ;; -------------------------------------------------------------------------
 
 ;; Predicated LD1 (single).
-(define_insn "maskload<mode><vpred>"
+(define_expand "maskload<mode><vpred>"
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+       (unspec:SVE_ALL
+         [(match_operand:<VPRED> 2 "aarch64_sve_reg_or_const_pred")
+          (match_operand:SVE_ALL 1 "memory_operand")
+          (match_operand:SVE_ALL 3 "aarch64_maskload_else_operand")]
+         UNSPEC_LD1_SVE))]
+  "TARGET_SVE"
+  {
+    int pat_cnt = aarch64_count_pred_pat_128 (operands[2], <MODE>mode);
+    int width = <elem_bits> * pat_cnt;
+    if (aarch64_sve_full_data_mode_p (<MODE>mode)
+       && pat_cnt && (pat_cnt == 1 || !BYTES_BIG_ENDIAN)
+       && known_le (width, 128))
+      {
+       machine_mode mode = aarch64_simd_container_mode (<VEL>mode, width);
+       if (mode != VOIDmode)
+       {
+         aarch64_emit_load_store_through_mode (operands[0],
+                                               operands[1], mode);
+         DONE;
+       }
+      }
+    if (!REG_P (operands[2]))
+      operands[2] = force_reg (<VPRED>mode, operands[2]);
+  }
+)
+
+;; Predicated LD1 (single).
+(define_insn "*aarch64_maskload<mode><vpred>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
        (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
@@ -2287,7 +2316,36 @@
 ;; -------------------------------------------------------------------------
 
 ;; Predicated ST1 (single).
-(define_insn "maskstore<mode><vpred>"
+(define_expand "maskstore<mode><vpred>"
+  [(set (match_operand:SVE_ALL 0 "memory_operand")
+       (unspec:SVE_ALL
+         [(match_operand:<VPRED> 2 "aarch64_sve_reg_or_const_pred")
+          (match_operand:SVE_ALL 1 "register_operand")
+          (match_dup 0)]
+         UNSPEC_ST1_SVE))]
+  "TARGET_SVE"
+  {
+    int pat_cnt = aarch64_count_pred_pat_128 (operands[2], <MODE>mode);
+    int width = <elem_bits> * pat_cnt;
+    if (aarch64_sve_full_data_mode_p (<MODE>mode)
+       && pat_cnt && (pat_cnt == 1 || !BYTES_BIG_ENDIAN)
+       && known_le (width, 128))
+      {
+       machine_mode mode = aarch64_simd_container_mode (<VEL>mode, width);
+       if (mode != VOIDmode)
+       {
+         aarch64_emit_load_store_through_mode (operands[0],
+                                               operands[1], mode);
+         DONE;
+       }
+      }
+    if (!REG_P (operands[2]))
+      operands[2] = force_reg (<VPRED>mode, operands[2]);
+  }
+)
+
+;; Predicated ST1 (single).
+(define_insn "*aarch64_maskstore<mode><vpred>"
   [(set (match_operand:SVE_ALL 0 "memory_operand" "+m")
        (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..720c00980d8 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -352,7 +352,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
                                                         const_tree type,
                                                         int misalignment,
                                                         bool is_packed);
-static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                            aarch64_addr_query_type);
 
@@ -1765,6 +1764,14 @@ aarch64_sve_data_mode_p (machine_mode mode)
   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
 }
 
+/* Return true if MODE is an SVE data vector mode, but not a partial mode;
+   either a single vector or a structure of vectors.  */
+bool
+aarch64_sve_full_data_mode_p (machine_mode mode)
+{
+  return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA;
+}
+
 /* Return the number of defined bytes in one constituent vector of
    SVE mode MODE, which has vector flags VEC_FLAGS.  */
 static poly_int64
@@ -6410,8 +6417,27 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
   return gen_rtx_MEM (mode, force_reg (Pmode, addr));
 }
 
-/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
-   that is known to contain PTRUE.  */
+/* Emit a load/store from a subreg of SRC to a subreg of DEST.
+   The subregs have mode NEW_MODE. Use only for reg<->mem moves.  */
+void
+aarch64_emit_load_store_through_mode (rtx dest, rtx src, machine_mode new_mode)
+{
+  gcc_assert ((REG_P (src) && MEM_P (dest))
+             || (REG_P (dest) && MEM_P (src)));
+  machine_mode mode = GET_MODE (dest);
+  if (MEM_P (src))
+    {
+      rtx tmp = force_reg (new_mode, adjust_address (src, new_mode, 0));
+      emit_move_insn (dest, lowpart_subreg (mode, tmp, new_mode));
+    }
+  else
+    emit_move_insn (adjust_address (dest, new_mode, 0),
+                   force_lowpart_subreg (new_mode, src, mode));
+}
+
+/* PRED is a predicate that is known to contain PTRUE.
+   For 128-bit VLS loads/stores, emit LDR/STR.
+   Else, emit an SVE predicated move from SRC to DEST.  */
 
 void
 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
@@ -6421,16 +6447,7 @@ aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
       && known_eq (GET_MODE_SIZE (mode), 16)
       && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
       && !BYTES_BIG_ENDIAN)
-    {
-      if (MEM_P (src))
-       {
-         rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
-         emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
-       }
-      else
-       emit_move_insn (adjust_address (dest, V16QImode, 0),
-                       force_lowpart_subreg (V16QImode, src, mode));
-    }
+    aarch64_emit_load_store_through_mode (dest, src, V16QImode);
   else
     {
       expand_operand ops[3];
@@ -22519,6 +22536,57 @@ aarch64_full_sve_mode (scalar_mode mode)
     }
 }
 
+/* Return the 8-bit mode for element mode MODE, if it exists.  */
+opt_machine_mode
+aarch64_v8_mode (scalar_mode mode)
+{
+  switch (mode)
+    {
+    case E_QImode:
+      return E_QImode;
+    default:
+      return {};
+    }
+}
+
+/* Return the 16-bit mode for element mode MODE, if it exists.  */
+opt_machine_mode
+aarch64_v16_mode (scalar_mode mode)
+{
+  switch (mode)
+    {
+    case E_HFmode:
+      return E_HFmode;
+    case E_BFmode:
+      return E_BFmode;
+    case E_HImode:
+      return E_HImode;
+    case E_QImode:
+      return E_HImode;
+    default:
+      return {};
+    }
+}
+
+/* Return the 32-bit mode for element mode MODE, if it exists.  */
+opt_machine_mode
+aarch64_v32_mode (scalar_mode mode)
+{
+  switch (mode)
+    {
+    case E_SFmode:
+      return E_SFmode;
+    case E_SImode:
+      return E_SImode;
+    case E_HImode:
+      return E_SImode;
+    case E_QImode:
+      return E_SImode;
+    default:
+      return {};
+    }
+}
+
 /* Return the 64-bit Advanced SIMD vector mode for element mode MODE,
    if it exists.  */
 opt_machine_mode
@@ -22573,7 +22641,7 @@ aarch64_v128_mode (scalar_mode mode)
 
 /* Return appropriate SIMD container
    for MODE within a vector of WIDTH bits.  */
-static machine_mode
+machine_mode
 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 {
   if (TARGET_SVE
@@ -22581,13 +22649,21 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
       && known_eq (width, BITS_PER_SVE_VECTOR))
     return aarch64_full_sve_mode (mode).else_mode (word_mode);
 
-  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
+  gcc_assert (known_eq (width, 64) || known_eq (width, 128)
+             || known_eq (width, 32) || known_eq (width, 16)
+             || known_eq (width, 8));
   if (TARGET_BASE_SIMD)
     {
       if (known_eq (width, 128))
        return aarch64_v128_mode (mode).else_mode (word_mode);
-      else
+      else if (known_eq (width, 64))
        return aarch64_v64_mode (mode).else_mode (word_mode);
+      else if (known_eq (width, 32))
+       return aarch64_v32_mode (mode).else_mode (VOIDmode);
+      else if (known_eq (width, 16))
+       return aarch64_v16_mode (mode).else_mode (VOIDmode);
+      else
+       return aarch64_v8_mode (mode).else_mode (VOIDmode);
     }
   return word_mode;
 }
@@ -23526,6 +23602,26 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info,
   return false;
 }
 
+/* If PRED is a patterned SVE PTRUE predicate with patterns
+   VL1, VL2, VL4, VL8, or VL16, return the number of active lanes
+   for the mode MODE. Else return 0.  */
+int
+aarch64_count_pred_pat_128 (rtx pred, machine_mode mode)
+{
+  struct simd_immediate_info info;
+  bool is_valid;
+  is_valid = aarch64_simd_valid_imm (pred, &info, AARCH64_CHECK_MOV);
+  if (!is_valid || info.insn != simd_immediate_info::PTRUE)
+    return 0;
+  aarch64_svpattern pattern = info.u.pattern;
+  unsigned int cnt
+    = aarch64_fold_sve_cnt_pat (pattern, 128 / GET_MODE_UNIT_BITSIZE (mode));
+  if (pattern <= AARCH64_SV_VL16 && pow2p_hwi (cnt))
+    return cnt;
+  else
+    return 0;
+}
+
 /* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD.  */
 bool
 aarch64_simd_valid_mov_imm (rtx op)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 146453b0516..e7db193383b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1384,12 +1384,19 @@
 ;; element.
 (define_mode_attr elem_bits [(VNx16BI "8") (VNx8BI "16")
                             (VNx4BI "32") (VNx2BI "64")
-                            (VNx16QI "8") (VNx32QI "8") (VNx64QI "8")
-                            (VNx8HI "16") (VNx16HI "16") (VNx32HI "16")
-                            (VNx8HF "16") (VNx16HF "16") (VNx32HF "16")
-                            (VNx8BF "16") (VNx16BF "16") (VNx32BF "16")
-                            (VNx4SI "32") (VNx8SI "32") (VNx16SI "32")
-                            (VNx4SF "32") (VNx8SF "32") (VNx16SF "32")
+                            (VNx2QI "8") (VNx4QI "8")
+                            (VNx8QI "8") (VNx16QI "8")
+                            (VNx32QI "8") (VNx64QI "8")
+                            (VNx2HI "16") (VNx4HI "16") (VNx8HI "16")
+                            (VNx16HI "16") (VNx32HI "16")
+                            (VNx2HF "16") (VNx4HF "16") (VNx8HF "16")
+                            (VNx16HF "16") (VNx32HF "16")
+                            (VNx2BF "16") (VNx4BF "16") (VNx8BF "16")
+                            (VNx16BF "16") (VNx32BF "16")
+                            (VNx2SI "32") (VNx4SI "32")
+                            (VNx8SI "32") (VNx16SI "32")
+                            (VNx2SF "32") (VNx4SF "32")
+                            (VNx8SF "32") (VNx16SF "32")
                             (VNx2DI "64") (VNx4DI "64") (VNx8DI "64")
                             (VNx2DF "64") (VNx4DF "64") (VNx8DF "64")
                             (VNx1TI "128")])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 1ab1c696c62..1eeda58a1e5 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -813,6 +813,10 @@
   (and (match_code "const")
        (match_test "aarch64_sve_ptrue_svpattern_p (op, NULL)")))
 
+(define_predicate "aarch64_sve_reg_or_const_pred"
+  (ior (match_operand 0 "register_operand")
+       (match_code "const_vector")))
+
 (define_predicate "aarch64_sve_arith_immediate"
   (and (match_code "const,const_vector")
        (match_test "aarch64_sve_arith_immediate_p (mode, op, false)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c
index f06a74aa2da..05e266aad7d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c
@@ -11,8 +11,7 @@ extern "C" {
 
 /*
 ** load_vl1:
-**     ptrue   (p[0-7])\.[bhsd], vl1
-**     ld1h    z0\.h, \1/z, \[x0\]
+**     ldr     h0, \[x0\]
 **     ret
 */
 svint16_t
@@ -22,7 +21,12 @@ load_vl1 (int16_t *ptr)
 }
 
 /*
-** load_vl2:
+** load_vl2: { target aarch64_little_endian }
+**     ldr     s0, \[x0\]
+**     ret
+*/
+/*
+** load_vl2: { target aarch64_big_endian }
 **     ptrue   (p[0-7])\.h, vl2
 **     ld1h    z0\.h, \1/z, \[x0\]
 **     ret
@@ -46,7 +50,12 @@ load_vl3 (int16_t *ptr)
 }
 
 /*
-** load_vl4:
+** load_vl4: { target aarch64_little_endian }
+**     ldr     d0, \[x0\]
+**     ret
+*/
+/*
+** load_vl4: { target aarch64_big_endian }
 **     ptrue   (p[0-7])\.h, vl4
 **     ld1h    z0\.h, \1/z, \[x0\]
 **     ret
@@ -94,7 +103,12 @@ load_vl7 (int16_t *ptr)
 }
 
 /*
-** load_vl8:
+** load_vl8: { target aarch64_little_endian }
+**     ldr     q0, \[x0\]
+**     ret
+*/
+/*
+** load_vl8: { target aarch64_big_endian }
 **     ptrue   (p[0-7])\.h, vl8
 **     ld1h    z0\.h, \1/z, \[x0\]
 **     ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c
new file mode 100644
index 00000000000..855514a34e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#include <arm_sve.h>
+
+#define TEST(TYPE, TY, W, B)                                           \
+  sv##TYPE                                                             \
+  ld1_##TY##W##B##_1 (TYPE *x)                                         \
+  {                                                                    \
+    svbool_t pg = svwhilelt_b##B (0, W);                               \
+    return svld1_##TY##B (pg, x);                                      \
+  }                                                                    \
+  sv##TYPE                                                             \
+  ld1_##TY##W##B##_2 (TYPE *x)                                         \
+  {                                                                    \
+    svbool_t pg = svptrue_pat_b##B ((enum svpattern) (W > 8 ? 9 : W)); \
+    return svld1_##TY##B (pg, x);                                      \
+  }                                                                    \
+  void                                                                 \
+  st1_##TY##W##B##_1 (TYPE *x, sv##TYPE data)                          \
+  {                                                                    \
+    svbool_t pg = svwhilelt_b##B (0, W);                               \
+    return svst1_##TY##B (pg, x, data);                                \
+  }                                                                    \
+  void                                                                 \
+  st1_##TY##W##B##_2 (TYPE *x, sv##TYPE data)                          \
+  {                                                                    \
+    svbool_t pg = svptrue_pat_b##B ((enum svpattern) (W > 8 ? 9 : W)); \
+    return svst1_##TY##B (pg, x, data);                                \
+  }                                                                    \
+
+#define TEST64(TYPE, TY, B)                            \
+  TEST (TYPE, TY, 1, B)                                        \
+  TEST (TYPE, TY, 2, B)                                        \
+
+#define TEST32(TYPE, TY, B)                            \
+  TEST64 (TYPE, TY, B)                                 \
+  TEST (TYPE, TY, 4, B)                                        \
+
+#define TEST16(TYPE, TY, B)                            \
+  TEST32 (TYPE, TY, B)                                 \
+  TEST (TYPE, TY, 8, B)                                        \
+
+#define TEST8(TYPE, TY, B)                             \
+  TEST16 (TYPE, TY, B)                                 \
+  TEST (TYPE, TY, 16, B)
+
+#define T(TYPE, TY, B)                 \
+  TEST##B (TYPE, TY, B)
+
+T (bfloat16_t, bf, 16)
+T (float16_t, f, 16)
+T (float32_t, f, 32)
+T (float64_t, f, 64)
+T (int8_t, s, 8)
+T (int16_t, s, 16)
+T (int32_t, s, 32)
+T (int64_t, s, 64)
+T (uint8_t, u, 8)
+T (uint16_t, u, 16)
+T (uint32_t, u, 32)
+T (uint64_t, u, 64)
+
+/* { dg-final { scan-assembler-times {\tldr\tq0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tldr\td0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts0, \[x0\]} 14 } } */
+/* { dg-final { scan-assembler-times {\tldr\th0, \[x0\]} 12 } } */
+/* { dg-final { scan-assembler-times {\tldr\tb0, \[x0\]} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tstr\tq0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tstr\td0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts0, \[x0\]} 14 } } */
+/* { dg-final { scan-assembler-times {\tstr\th0, \[x0\]} 12 } } */
+/* { dg-final { scan-assembler-times {\tstr\tb0, \[x0\]} 4 } } */
+
+/* The optimization is not applied to VNx8HFmode and VNx8BFmode with a
+   VL2 predicate, because there are no move patterns defined for V2HF
+   and V2BF. */
+/* { dg-final { scan-assembler-times {\tptrue\tp([0-7]).h, vl2\n\tld1h\tz0.h, p\1/z, \[x0\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp([0-7]).h, vl2\n\tst1h\tz0.h, p\1, \[x0\]} 4 } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_7.c b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c
index a66a20d21f6..ab2fa3646fc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c
@@ -19,7 +19,7 @@
 
 TEST_ALL (ADD_LOOP)
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x0\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+, \[x0\]} 1 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_9.c b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c
index dd3f404ab39..99940dd73fa 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/while_9.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c
@@ -19,7 +19,7 @@
 
 TEST_ALL (ADD_LOOP)
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\tq[0-9]+\, \[x0\]} 1 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
-- 
2.34.1
