Author: CarolineConcatto
Date: 2024-07-01T08:23:16+01:00
New Revision: 6859e5a169aa235ac04005aaa86ed5ae11372c4c
URL: https://github.com/llvm/llvm-project/commit/6859e5a169aa235ac04005aaa86ed5ae11372c4c
DIFF: https://github.com/llvm/llvm-project/commit/6859e5a169aa235ac04005aaa86ed5ae11372c4c.diff

LOG: [CLANG][LLVM][AArch64] Add SME2.1 intrinsics for MOVAZ array to vector (#88901)

According to the specification in ARM-software/acle#309, this adds the
intrinsics that move and zero multiple ZA single-vector groups to vector
registers:

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice) __arm_streaming __arm_inout("za");

// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice) __arm_streaming __arm_inout("za");

Added:


Modified:
    clang/include/clang/Basic/arm_sme.td
    clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
    llvm/lib/Target/AArch64/SMEInstrFormats.td
    llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll

Removed:


################################################################################
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ce211f97d1c96..ce8908f566f2f 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -805,4 +805,16 @@ defm SVREADZ_ZA16 : ZAReadzSingle<"za16", "sUshb", "aarch64_sme_readz", [ImmChe
 defm SVREADZ_ZA32 : ZAReadzSingle<"za32", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>;
 defm SVREADZ_ZA64 : ZAReadzSingle<"za64", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>;
 defm SVREADZ_ZA128 : ZAReadzSingle<"za128", "csilUcUiUsUlbhfd", "aarch64_sme_readz_q", [ImmCheck<0, ImmCheck0_15>]>;
+
+multiclass ZAReadzArray<string vg_num>{
+  let SMETargetGuard = "sme2p1" in {
+    def NAME # _B : SInst<"svreadz_za8_{d}_vg1x" # vg_num, vg_num # "m", "cUc", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+    def NAME # _H : SInst<"svreadz_za16_{d}_vg1x" # vg_num, vg_num # "m", "sUsbh", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+    def NAME # _S : SInst<"svreadz_za32_{d}_vg1x" # vg_num, vg_num # "m", "iUif", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+    def NAME # _D : SInst<"svreadz_za64_{d}_vg1x" # vg_num, vg_num # "m", "lUld", MergeNone, "aarch64_sme_readz_x" # vg_num, [IsStreaming, IsInOutZA]>;
+  }
+}
+
+defm SVREADZ_VG2 : ZAReadzArray<"2">;
+defm SVREADZ_VG4 : ZAReadzArray<"4">;
 } // let SVETargetGuard = InvalidMode

diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
index 7c9067a5ceece..466ca13032633 100644
--- a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
@@ -1823,3 +1823,708 @@ svfloat64_t test_svreadz_hor_za128_f64(uint32_t slice) __arm_streaming __arm_ino
 {
    return svreadz_hor_za128_f64(15, slice);
 }
+
+// MOVAZ Array to Vector
+//
+// X2
+//
+
+// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_za8_s8_x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]]
= tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z22test_svreadz_za8_s8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] +// +svint8x2_t test_svreadz_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za8_s8_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 32 x i8> @test_svreadz_za8_u8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i8> @_Z22test_svreadz_za8_u8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x2.nxv16i8(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] +// +svuint8x2_t test_svreadz_za8_u8_x2(uint32_t slice) __arm_streaming 
__arm_inout("za") +{ + return svreadz_za8_u8_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_za16_s16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z24test_svreadz_za16_s16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] +// +svint16x2_t test_svreadz_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_s16_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i16> @test_svreadz_za16_u16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i16> @_Z24test_svreadz_za16_u16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x2.nxv8i16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] +// +svuint16x2_t test_svreadz_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_u16_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_za32_s32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z24test_svreadz_za32_s32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] +// +svint32x2_t test_svreadz_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_s32_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i32> @test_svreadz_za32_u32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i32> @_Z24test_svreadz_za32_u32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x2.nxv4i32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] +// +svuint32x2_t test_svreadz_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_u32_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_za64_s64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z24test_svreadz_za64_s64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] +// +svint64x2_t test_svreadz_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_s64_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x i64> @test_svreadz_za64_u64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i64> @_Z24test_svreadz_za64_u64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: 
entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x2.nxv2i64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] +// +svuint64x2_t test_svreadz_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_u64_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @test_svreadz_za16_bf16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8) +// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x bfloat> @_Z25test_svreadz_za16_bf16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP4]] +// +svbfloat16x2_t test_svreadz_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_bf16_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x half> @test_svreadz_za16_f16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 
1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) +// CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x half> @_Z24test_svreadz_za16_f16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x2.nxv8f16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]] +// +svfloat16x2_t test_svreadz_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_f16_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x float> @test_svreadz_za32_f32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x float> @_Z24test_svreadz_za32_f32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x2.nxv4f32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]] +// +svfloat32x2_t test_svreadz_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_f32_vg1x2(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x double> @test_svreadz_za64_f64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]]) +// 
CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) +// CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 4 x double> @_Z24test_svreadz_za64_f64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x2.nxv2f64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]] +// +svfloat64x2_t test_svreadz_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_f64_vg1x2(slice); +} + +// +// X4 +// + +// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_za8_s8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) +// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z22test_svreadz_za8_s8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] +// +svint8x4_t test_svreadz_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za8_s8_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_svreadz_za8_u8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) +// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 64 x i8> @_Z22test_svreadz_za8_u8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.readz.x4.nxv16i8(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale 
x 16 x i8> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] +// +svuint8x4_t test_svreadz_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za8_u8_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_za16_s16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) +// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z24test_svreadz_za16_s16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> 
[[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] +// +svint16x4_t test_svreadz_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_s16_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 32 x i16> @test_svreadz_za16_u16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) +// CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x i16> @_Z24test_svreadz_za16_u16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.readz.x4.nxv8i16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 1 +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] +// +svuint16x4_t test_svreadz_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_u16_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_za32_s32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z24test_svreadz_za32_s32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) 
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] +// +svint32x4_t test_svreadz_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_s32_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i32> @test_svreadz_za32_u32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i32> @_Z24test_svreadz_za32_u32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.readz.x4.nxv4i32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 2 +// CPP-CHECK-NEXT: 
[[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] +// +svuint32x4_t test_svreadz_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_u32_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_za64_s64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) +// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z24test_svreadz_za64_s64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = 
extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] +// +svint64x4_t test_svreadz_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_s64_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i64> @test_svreadz_za64_u64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) +// CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i64> @_Z24test_svreadz_za64_u64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.readz.x4.nxv2i64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP4]], <vscale x 2 x i64> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> 
@llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] +// +svuint64x4_t test_svreadz_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_u64_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @test_svreadz_za16_bf16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24) +// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x bfloat> @_Z25test_svreadz_za16_bf16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]], 3 +// CPP-CHECK-NEXT: 
[[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP8]] +// +svbfloat16x4_t test_svreadz_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_bf16_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 32 x half> @test_svreadz_za16_f16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24) +// CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 32 x half> @_Z24test_svreadz_za16_f16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.readz.x4.nxv8f16(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> poison, <vscale x 8 x half> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> 
@llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]] +// +svfloat16x4_t test_svreadz_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za16_f16_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x float> @test_svreadz_za32_f32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x float> @_Z24test_svreadz_za32_f32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.readz.x4.nxv4f32(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> 
@llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]] +// +svfloat32x4_t test_svreadz_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za32_f32_vg1x4(slice); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x double> @test_svreadz_za64_f64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6) +// CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x double> @_Z24test_svreadz_za64_f64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.readz.x4.nxv2f64(i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> poison, <vscale x 2 x double> [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 
8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]] +// +svfloat64x4_t test_svreadz_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_za64_f64_vg1x4(slice); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index cbefd2e08a966..6f3694cf952d4 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2869,6 +2869,16 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_readz_q_horiz : SME_MOVAZ_TileToVector_Intrinsic; def int_aarch64_sme_readz_q_vert : SME_MOVAZ_TileToVector_Intrinsic; + def int_aarch64_sme_readz_x2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_i32_ty], + [IntrNoMem, IntrHasSideEffects]>; + + def int_aarch64_sme_readz_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty], + [IntrNoMem, IntrHasSideEffects]>; + def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>; class SME_OuterProduct_Intrinsic diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 544eec3ab9cec..59cfd8d6c27d2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -395,8 +395,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { template <unsigned MaxIdx, unsigned Scale> void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg, unsigned Op); - void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned Op, - unsigned MaxIdx, unsigned Scale); + void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, + unsigned Op, unsigned MaxIdx, unsigned Scale, + unsigned BaseReg = 0); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. 
template <int64_t Min, int64_t Max>
@@ -2006,9 +2007,14 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
 void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs,
                                                  unsigned Op, unsigned MaxIdx,
-                                                 unsigned Scale) {
+                                                 unsigned Scale, unsigned BaseReg) {
+  // The slice operand can be in different positions:
+  // Array to vector: llvm.aarch64.sme.readz.x<2|4>(slice)
+  // Tile to vector:  llvm.aarch64.sme.readz[.q].<h|v>.<sz>(tile, slice)
+  SDValue SliceBase = N->getOperand(2);
+  if (BaseReg != AArch64::ZA)
+    SliceBase = N->getOperand(3);
-  SDValue SliceBase = N->getOperand(3);
   SDValue Base, Offset;
   if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale))
     return;
@@ -2016,8 +2022,12 @@ void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs,
   // See EmitZAInstr
   // DAG cannot select Za tile as an output register with ZReg
   SDLoc DL(N);
-  SDValue Ops[] = {/*TileNum*/ N->getOperand(2), Base, Offset,
-                   /*Chain*/ N->getOperand(0)};
+  SmallVector<SDValue, 6> Ops;
+  if (BaseReg != AArch64::ZA)
+    Ops.push_back(N->getOperand(2));
+  Ops.push_back(Base);
+  Ops.push_back(Offset);
+  Ops.push_back(N->getOperand(0)); // Chain
   SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops);
   EVT VT = N->getValueType(0);
@@ -5342,6 +5352,16 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case Intrinsic::aarch64_sme_readz_x2: {
+    SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_VG2_2ZMXI_PSEUDO, 7, 1,
+                           AArch64::ZA);
+    return;
+  }
+  case Intrinsic::aarch64_sme_readz_x4: {
+    SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_VG4_4ZMXI_PSEUDO, 7, 1,
+                           AArch64::ZA);
+    return;
+  }
   case Intrinsic::swift_async_context_addr: {
     SDLoc DL(Node);
     SDValue Chain = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index acce9515e832c..130d1e9683f7e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2995,6 +2995,11 @@ AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
     MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
     StartIdx++;
   } else {
+    // Add the destination ZPR first unless the mnemonic is za.<sz>[Reg, Imm] (ZA output).
+    if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
+      MIB.add(MI.getOperand(StartIdx)); // Output ZPR
+      ++StartIdx;
+    }
     MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
   }
   for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 833e6cfcaf6c5..709a98d3a8cb4 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -818,8 +818,8 @@ defm MOVAZ_ZMI : sme2p1_movaz_tile_to_vec<"movaz", int_aarch64_sme_readz_horiz,
                                           int_aarch64_sme_readz_q_horiz, int_aarch64_sme_readz_q_vert>;
 defm MOVAZ_2ZMI : sme2p1_movaz_tile_to_vec_vg2<"movaz">;
 defm MOVAZ_4ZMI : sme2p1_movaz_tile_to_vec_vg4<"movaz">;
-defm MOVAZ_VG2_2ZM : sme2_mova_array_to_vec_vg2_multi<0b010, "movaz">;
-defm MOVAZ_VG4_4ZM : sme2_mova_array_to_vec_vg4_multi<0b1100, "movaz">;
+defm MOVAZ_VG2_2ZMXI : sme2_movaz_array_to_vec_vg2_multi<"movaz">;
+defm MOVAZ_VG4_4ZMXI : sme2_movaz_array_to_vec_vg4_multi<"movaz">;
 defm ZERO_MXI : sme2p1_zero_matrix<"zero">;
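For context on the slice handling in SelectMultiVectorMoveZ above, the two intrinsic shapes the selector now distinguishes look as follows from C. This is a minimal sketch only, assuming arm_sme.h on an SME2.1 streaming target: the wrapper names are illustrative, the tile form follows the single-vector tests earlier in acle_sme2p1_movaz.c, and the operand numbers refer to the SelectionDAG node (operand 0 is the chain, operand 1 the intrinsic ID).

#include <arm_sme.h>

// Tile to vector: the intrinsic carries (tile, slice), so the slice is the
// second argument and ends up as DAG operand 3.
svint8_t readz_from_tile(uint32_t slice) __arm_streaming __arm_inout("za") {
  return svreadz_hor_za8_s8(0, slice);
}

// Array to vector: only the slice is passed, so it is DAG operand 2; the
// selector is invoked with BaseReg == AArch64::ZA for this form.
svint8x2_t readz_from_array(uint32_t slice) __arm_streaming __arm_inout("za") {
  return svreadz_za8_s8_vg1x2(slice);
}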
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 5bd19a73f03ce..77cf5cb56728b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -117,6 +117,15 @@ class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, R
   let SMEMatrixType = za_flag;
   let usesCustomInserter = 1;
 }
+
+class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
+                                      SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -4287,7 +4296,7 @@ class sme2_mova_array_to_vec_vg24_multi<bits<4>op, RegisterOperand vector_ty,
 // move array to vector, two registers.
 multiclass sme2_mova_array_to_vec_vg2_multi<bits<3> opc, string mnemonic> {
   def NAME : sme2_mova_array_to_vec_vg24_multi<{opc,?}, ZZ_d_mul_r, MatrixOp64,
-                                               mnemonic, "vgx2"> {
+                                               mnemonic, "vgx2">, SMEPseudo2Instr<NAME, 1> {
     bits<4> Zd;
     let Inst{4-1} = Zd;
   }
@@ -4359,10 +4368,15 @@ multiclass sme2_mova_array_to_vec_vg2_multi<bits<3> opc, string mnemonic> {
   }
 }
 
+multiclass sme2_movaz_array_to_vec_vg2_multi<string mnemonic> {
+  defm NAME : sme2_mova_array_to_vec_vg2_multi<0b010, mnemonic>;
+  def NAME # _PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZ_d_mul_r, SMEMatrixArray>;
+}
+
 // move array to vector, four registers
 multiclass sme2_mova_array_to_vec_vg4_multi<bits<4> opc, string mnemonic> {
   def NAME : sme2_mova_array_to_vec_vg24_multi<opc, ZZZZ_d_mul_r, MatrixOp64,
-                                               mnemonic, "vgx4"> {
+                                               mnemonic, "vgx4">, SMEPseudo2Instr<NAME, 1> {
     bits<3> Zd;
     let Inst{4-2} = Zd;
   }
@@ -4434,6 +4448,11 @@ multiclass sme2_mova_array_to_vec_vg4_multi<bits<4> opc, string mnemonic> {
   }
 }
 
+multiclass sme2_movaz_array_to_vec_vg4_multi<string mnemonic> {
+  defm NAME : sme2_mova_array_to_vec_vg4_multi<0b1100, mnemonic>;
+  def NAME # _PSEUDO : sme2_movaz_array_to_tile_pseudo<NAME, sme_elm_idx0_7, ZZZZ_d_mul_r, SMEMatrixArray>;
+}
+
 //===----------------------------------------------------------------------===//
 // SME2 multi-vec saturating shift right narrow
 class sme2_sat_shift_vector_vg2<string mnemonic, bit op, bit u>
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
index 1a4393521aadc..efcb5dc145797 100644
--- a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
@@ -901,4 +901,224 @@ declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32, i32)
 declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32, i32)
 declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32, i32)
-attributes #0 = { nounwind "target-features" = "+sme2p1"}
+
+; MOVAZ (array to vector, multi)
+
+
+;;
+; X2
+;;
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x2(i32 %slice) #0 {
+; CHECK-LABEL: test_readz_z8_i8_x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
+; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: ret
+  %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice)
+  %slice.max = add i32 %slice, 7
+  %res2 = call {<vscale x 16 x i8>,
<vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice.max) + ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2 +} + +define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice.max) + ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2 +} + +define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z32_i32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice.max) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2 +} + +define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z64_i64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice.max) + ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res2 +} + +define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice.max) + ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2 +} + +define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice.max) + ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2 +} + +define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z32_f32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 4 x float>, <vscale x 4 x float>} 
@llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice.max) + ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2 +} + +define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z64_f64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice.max) + ret {<vscale x 2 x double>, <vscale x 2 x double>} %res2 +} + +;; +; X4 +;; + +define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z8_i8_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice.max) + ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res2 +} + +define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_i16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice.max) + ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %res2 +} + +define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z32_i32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice.max) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res2 +} + +define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z64_i64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale 
x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice.max) + ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %res2 +} + +define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_bf16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice.max) + ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2 +} + +define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z16_f16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice.max) + ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2 +} + +define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z32_f32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice.max) + ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res2 +} + +define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_z64_f64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice) + %slice.max = add i32 %slice, 7 + %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice.max) + ret {<vscale x 2 x double>, <vscale x 2 x double>, 
<vscale x 2 x double>, <vscale x 2 x double>} %res2 } + +attributes #0 = { "target-features"="+sme2p1" }
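To close, a short usage sketch of the new array-to-vector intrinsics (illustrative only, assuming an SME2.1 target with arm_sme.h; consume_group is a made-up name, and svget4_f32 is the regular ACLE tuple accessor):

#include <arm_sme.h>

// Read and zero the four-vector group selected by slice, then peel off the
// first vector of the returned tuple; the corresponding ZA contents read as
// zero afterwards.
svfloat32_t consume_group(uint32_t slice) __arm_streaming __arm_inout("za") {
  svfloat32x4_t zt = svreadz_za32_f32_vg1x4(slice);
  return svget4_f32(zt, 0);
}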