sagarkulkarni19 updated this revision to Diff 439919.
sagarkulkarni19 edited the summary of this revision.
sagarkulkarni19 added a comment.

Updated the test cases and also added the `vnum` variants of the ld1 and st1
intrinsics.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D127910/new/

https://reviews.llvm.org/D127910

Files:
  clang/include/clang/Basic/TargetBuiltins.h
  clang/include/clang/Basic/arm_sve.td
  clang/lib/Basic/Targets/AArch64.cpp
  clang/lib/Basic/Targets/AArch64.h
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/lib/Headers/CMakeLists.txt
  clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
  clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
  clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
  clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
  clang/utils/TableGen/SveEmitter.cpp
  clang/utils/TableGen/TableGen.cpp
  clang/utils/TableGen/TableGenBackends.h

Index: clang/utils/TableGen/TableGenBackends.h
===================================================================
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -101,6 +101,8 @@
 void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
+void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+
 void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===================================================================
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -80,6 +80,7 @@
   GenArmSveBuiltinCG,
   GenArmSveTypeFlags,
   GenArmSveRangeChecks,
+  GenArmSmeHeader,
   GenArmCdeHeader,
   GenArmCdeBuiltinDef,
   GenArmCdeBuiltinSema,
@@ -217,6 +218,8 @@
                    "Generate arm_sve_typeflags.inc for clang"),
         clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks",
                    "Generate arm_sve_sema_rangechecks.inc for clang"),
+        clEnumValN(GenArmSmeHeader, "gen-arm-sme-header",
+                   "Generate arm_sme.h for clang"),
         clEnumValN(GenArmMveHeader, "gen-arm-mve-header",
                    "Generate arm_mve.h for clang"),
         clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def",
@@ -434,6 +437,9 @@
   case GenArmSveRangeChecks:
     EmitSveRangeChecks(Records, OS);
     break;
+  case GenArmSmeHeader:
+    EmitSmeHeader(Records, OS);
+    break;
   case GenArmCdeHeader:
     EmitCdeHeader(Records, OS);
     break;
Index: clang/utils/TableGen/SveEmitter.cpp
===================================================================
--- clang/utils/TableGen/SveEmitter.cpp
+++ clang/utils/TableGen/SveEmitter.cpp
@@ -334,6 +334,9 @@
   /// Emit arm_sve.h.
   void createHeader(raw_ostream &o);
 
+  /// Emit arm_sme.h.
+  void createSMEHeader(raw_ostream &o);
+
   /// Emit all the __builtin prototypes and code needed by Sema.
   void createBuiltins(raw_ostream &o);
 
@@ -347,7 +350,9 @@
   void createTypeFlags(raw_ostream &o);
 
   /// Create intrinsic and add it to \p Out
-  void createIntrinsic(Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
+  void createIntrinsic(Record *R,
+                       SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out,
+                       bool IsSME = false);
 };
 
 } // end anonymous namespace
@@ -757,6 +762,11 @@
     NumVectors = 0;
     Signed = true;
     break;
+  case '%':
+    Pointer = true;
+    Void = true;
+    NumVectors = 0;
+    break;
   case 'A':
     Pointer = true;
     ElementBitwidth = Bitwidth = 8;
@@ -989,7 +999,7 @@
 }
 
 void SVEEmitter::createIntrinsic(
-    Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out) {
+    Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out, bool IsSME) {
   StringRef Name = R->getValueAsString("Name");
   StringRef Proto = R->getValueAsString("Prototype");
   StringRef Types = R->getValueAsString("Types");
@@ -1005,6 +1015,9 @@
   for (auto FlagRec : FlagsList)
     Flags |= FlagRec->getValueAsInt("Value");
 
+  bool SMEFlag = Flags & getEnumValueForFlag("IsSME");
+  if (SMEFlag != IsSME)
+    return;
   // Create a dummy TypeSpec for non-overloaded builtins.
   if (Types.empty()) {
     assert((Flags & getEnumValueForFlag("IsOverloadNone")) &&
@@ -1286,11 +1299,85 @@
   OS << "#endif /* __ARM_SVE_H */\n";
 }
 
+void SVEEmitter::createSMEHeader(raw_ostream &OS) { // Writes the complete arm_sme.h text to OS.
+  OS << "/*===---- arm_sme.h - ARM SME intrinsics "
+        "-----------------------------------===\n"
+        " *\n"
+        " *\n"
+        " * Part of the LLVM Project, under the Apache License v2.0 with LLVM "
+        "Exceptions.\n"
+        " * See https://llvm.org/LICENSE.txt for license information.\n"
+        " * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n"
+        " *\n"
+        " *===-----------------------------------------------------------------"
+        "------===\n"
+        " */\n\n";
+
+  OS << "#ifndef __ARM_SME_H\n"; // include guard, closed by the last line emitted
+  OS << "#define __ARM_SME_H\n\n";
+
+  OS << "#if !defined(__ARM_FEATURE_SME)\n"; // generated header #errors when the macro is undefined
+  OS << "#error \"SME support not enabled\"\n";
+  OS << "#else\n\n"; // everything emitted below sits in this #else branch
+
+  OS << "#include <arm_sve.h> \n\n"; // generated header includes arm_sve.h before any declaration
+
+  OS << "#ifdef  __cplusplus\n";
+  OS << "extern \"C\" {\n";
+  OS << "#endif\n\n";
+
+  SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+  for (auto *R : RV)
+    createIntrinsic(R, Defs, true); // IsSME=true: records without the IsSME flag are skipped
+
+  // Sort the header's intrinsics by these keys, in priority order (as for SVE):
+  // - Architectural guard (keeps equal guards adjacent for the #if loop below)
+  // - Class (is intrinsic overloaded or not)
+  // - Intrinsic name
+  std::stable_sort(Defs.begin(), Defs.end(),
+                   [](const std::unique_ptr<Intrinsic> &A,
+                      const std::unique_ptr<Intrinsic> &B) {
+                     auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
+                       return std::make_tuple(I->getGuard(),
+                                              (unsigned)I->getClassKind(),
+                                              I->getName());
+                     };
+                     return ToTuple(A) < ToTuple(B);
+                   });
+
+  StringRef InGuard = ""; // guard of the currently open #if region; empty means none
+  for (auto &I : Defs) {
+    // On a guard change, close the open #if region (if any) and open the next.
+    if (I->getGuard() != InGuard) {
+      if (!InGuard.empty())
+        OS << "#endif  //" << InGuard << "\n";
+      InGuard = I->getGuard();
+      if (!InGuard.empty())
+        OS << "\n#if " << InGuard << "\n";
+    }
+
+    // Emit the declaration for this intrinsic inside the current guard region.
+    I->emitIntrinsic(OS);
+  }
+
+  if (!InGuard.empty()) // close the guard region left open by the last intrinsic
+    OS << "#endif  //" << InGuard << "\n";
+
+  OS << "#ifdef __cplusplus\n";
+  OS << "} // extern \"C\"\n";
+  OS << "#endif\n\n";
+  OS << "#endif /*__ARM_FEATURE_SME */\n\n"; // ends the #if/#else opened on __ARM_FEATURE_SME
+  OS << "#endif /* __ARM_SME_H */\n";
+}
+
 void SVEEmitter::createBuiltins(raw_ostream &OS) {
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
-  for (auto *R : RV)
+  for (auto *R : RV) {
     createIntrinsic(R, Defs);
+    createIntrinsic(R, Defs, true);
+  }
 
   // The mappings must be sorted based on BuiltinID.
   llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1320,8 +1407,10 @@
 void SVEEmitter::createCodeGenMap(raw_ostream &OS) {
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
-  for (auto *R : RV)
+  for (auto *R : RV) {
     createIntrinsic(R, Defs);
+    createIntrinsic(R, Defs, true);
+  }
 
   // The mappings must be sorted based on BuiltinID.
   llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1353,8 +1442,10 @@
 void SVEEmitter::createRangeChecks(raw_ostream &OS) {
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
-  for (auto *R : RV)
+  for (auto *R : RV) {
     createIntrinsic(R, Defs);
+    createIntrinsic(R, Defs, true);
+  }
 
   // The mappings must be sorted based on BuiltinID.
   llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1418,6 +1509,10 @@
   SVEEmitter(Records).createHeader(OS);
 }
 
+void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { // Backend for the gen-arm-sme-header action.
+  SVEEmitter(Records).createSMEHeader(OS);
+}
+
 void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) {
   SVEEmitter(Records).createBuiltins(OS);
 }
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -0,0 +1,273 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); // expected IR: i64 operand 0, slice index = slice_base + 15
+}
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); // expected IR: i64 operand 1, slice index = slice_base + 7
+}
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); // expected IR: i64 operand 3, slice index = slice_base + 3
+}
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); // expected IR: i64 operand 7, slice index = slice_base + 1
+}
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_hor_vnum_za128(7, slice_base, 1, pg, ptr, vnum); // expected IR: i64 operand 7, slice index = slice_base + 1
+}
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); // expected IR: i64 operand 0, slice index = slice_base + 15
+}
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); // expected IR: i64 operand 1, slice index = slice_base + 7
+}
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); // expected IR: i64 operand 3, slice index = slice_base + 3
+}
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); // expected IR: i64 operand 7, slice index = slice_base + 1
+}
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+  svst1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); // both immediates 0: expected IR uses slice_base as the slice index as-is
+  svst1_ver_vnum_za128(7, slice_base, 1, pg, ptr, vnum); // expected IR: i64 operand 7, slice index = slice_base + 1
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -0,0 +1,209 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svst1_hor_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_hor_za8 (horizontal-slice store): tile 0 at slice_base + 0, then tile 0 again at slice_base + 15.
+void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_hor_za8(0, slice_base, 0, pg, ptr);
+  svst1_hor_za8(0, slice_base, 15, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_hor_za16 (horizontal-slice store): tile 0 at slice_base + 0, then tile 1 at slice_base + 7.
+void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_hor_za16(0, slice_base, 0, pg, ptr);
+  svst1_hor_za16(1, slice_base, 7, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_hor_za32 (horizontal-slice store): tile 0 at slice_base + 0, then tile 3 at slice_base + 3.
+void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_hor_za32(0, slice_base, 0, pg, ptr);
+  svst1_hor_za32(3, slice_base, 3, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_hor_za64 (horizontal-slice store): tile 0 at slice_base + 0, then tile 7 at slice_base + 1.
+void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_hor_za64(0, slice_base, 0, pg, ptr);
+  svst1_hor_za64(7, slice_base, 1, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_hor_za128 (horizontal-slice store): tile 0 then tile 15, both at slice offset 0, so the expected IR reuses slice_base without an add.
+void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_hor_za128(0, slice_base, 0, pg, ptr);
+  svst1_hor_za128(15, slice_base, 0, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_ver_za8 (vertical-slice store): tile 0 at slice_base + 0, then tile 0 again at slice_base + 15.
+void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_ver_za8(0, slice_base, 0, pg, ptr);
+  svst1_ver_za8(0, slice_base, 15, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_ver_za16 (vertical-slice store): tile 0 at slice_base + 0, then tile 1 at slice_base + 7.
+void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_ver_za16(0, slice_base, 0, pg, ptr);
+  svst1_ver_za16(1, slice_base, 7, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_ver_za32 (vertical-slice store): tile 0 at slice_base + 0, then tile 3 at slice_base + 3.
+void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_ver_za32(0, slice_base, 0, pg, ptr);
+  svst1_ver_za32(3, slice_base, 3, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_ver_za64 (vertical-slice store): tile 0 at slice_base + 0, then tile 7 at slice_base + 1.
+void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_ver_za64(0, slice_base, 0, pg, ptr);
+  svst1_ver_za64(7, slice_base, 1, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svst1_ver_za128 (vertical-slice store): tile 0 then tile 15, both at slice offset 0, so the expected IR reuses slice_base without an add.
+void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) {
+  svst1_ver_za128(0, slice_base, 0, pg, ptr);
+  svst1_ver_za128(15, slice_base, 0, pg, ptr);
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -0,0 +1,273 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_hor_vnum_za8 (horizontal-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 0 again at slice_base + 15.
+void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+  svld1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_hor_vnum_za16 (horizontal-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 1 at slice_base + 7. NOTE(review): the expected IR indexes an i16* by cntsb() * vnum, which scales a byte count by the element size -- confirm this offset is intended.
+void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+  svld1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_hor_vnum_za32 (horizontal-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 3 at slice_base + 3.
+void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+  svld1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_hor_vnum_za64 (horizontal-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 7 at slice_base + 1.
+void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+  svld1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_hor_vnum_za128 (horizontal-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 7 at slice_base + 1. NOTE(review): the non-vnum za128 tests in acle_sme_st1.c use tile 15 / offset 0 -- confirm 7 / 1 is valid for za128.
+void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+  svld1_hor_vnum_za128(7, slice_base, 1, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svld1_ver_vnum_za8ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], i8* [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_ver_vnum_za8 (vertical-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 0 again at slice_base + 15. Named after the intrinsic under test, like the sibling tests.
+void test_svld1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_ver_vnum_za16 (vertical-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 1 at slice_base + 7.
+void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_ver_vnum_za32 (vertical-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 3 at slice_base + 3.
+void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, i64* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_ver_vnum_za64 (vertical-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 7 at slice_base + 1.
+void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i128, i128* [[TMP0]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+// Exercises svld1_ver_vnum_za128 (vertical-slice load, ptr advanced by vnum): tile 0 at slice_base + 0, then tile 7 at slice_base + 1. NOTE(review): the non-vnum za128 tests in acle_sme_st1.c use tile 15 / offset 0 -- confirm 7 / 1 is valid for za128.
+void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za128(7, slice_base, 1, pg, ptr, vnum);
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -0,0 +1,209 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svld1_hor_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is used as i8* directly (no bitcast expected)
+  svld1_hor_za8(0, slice_base, 0, pg, ptr); // offset 0 -> slice_base passed unchanged, no add emitted
+  svld1_hor_za8(0, slice_base, 15, pg, ptr); // offset 15 -> `add i32 slice_base, 15`
+}
+
+// CHECK-LABEL: @test_svld1_hor_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i16*
+  svld1_hor_za16(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_hor_za16(1, slice_base, 7, pg, ptr); // first arg 1 -> `i64 1`; offset 7 -> `add i32 slice_base, 7`
+}
+
+// CHECK-LABEL: @test_svld1_hor_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i32*
+  svld1_hor_za32(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_hor_za32(3, slice_base, 3, pg, ptr); // first arg 3 -> `i64 3`; offset 3 -> `add i32 slice_base, 3`
+}
+
+// CHECK-LABEL: @test_svld1_hor_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i64*
+  svld1_hor_za64(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_hor_za64(7, slice_base, 1, pg, ptr); // first arg 7 -> `i64 7`; offset 1 -> `add i32 slice_base, 1`
+}
+
+// CHECK-LABEL: @test_svld1_hor_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i128*
+  svld1_hor_za128(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_hor_za128(15, slice_base, 0, pg, ptr); // first arg 15 -> `i64 15`; offset is 0 again, so slice_base is reused as-is
+}
+
+// CHECK-LABEL: @test_svld1_ver_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is used as i8* directly (no bitcast expected)
+  svld1_ver_za8(0, slice_base, 0, pg, ptr); // offset 0 -> slice_base passed unchanged, no add emitted
+  svld1_ver_za8(0, slice_base, 15, pg, ptr); // offset 15 -> `add i32 slice_base, 15`
+}
+
+// CHECK-LABEL: @test_svld1_ver_za16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i16*
+  svld1_ver_za16(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_ver_za16(1, slice_base, 7, pg, ptr); // first arg 1 -> `i64 1`; offset 7 -> `add i32 slice_base, 7`
+}
+
+// CHECK-LABEL: @test_svld1_ver_za32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i32*
+  svld1_ver_za32(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_ver_za32(3, slice_base, 3, pg, ptr); // first arg 3 -> `i64 3`; offset 3 -> `add i32 slice_base, 3`
+}
+
+// CHECK-LABEL: @test_svld1_ver_za64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i64*
+  svld1_ver_za64(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_ver_za64(7, slice_base, 1, pg, ptr); // first arg 7 -> `i64 7`; offset 1 -> `add i32 slice_base, 1`
+}
+
+// CHECK-LABEL: @test_svld1_ver_za128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128*
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { // ptr is expected to be bitcast to i128*
+  svld1_ver_za128(0, slice_base, 0, pg, ptr); // first arg 0 -> `i64 0`; offset 0 -> no add emitted
+  svld1_ver_za128(15, slice_base, 0, pg, ptr); // first arg 15 -> `i64 15`; offset is 0 again, so slice_base is reused as-is
+}
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -306,6 +306,8 @@
   clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h)
   # Generate arm_sve.h
   clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h)
+  # Generate arm_sme.h
+  clang_generate_header(-gen-arm-sme-header arm_sve.td arm_sme.h)
   # Generate arm_bf16.h
   clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h)
   # Generate arm_mve.h
@@ -330,6 +332,7 @@
 
   list(APPEND aarch64_only_generated_files
     "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h"
+    "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h"
     "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h"
     "${output_dir}/arm_neon_sve_bridge.h"
     )
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -4219,6 +4219,10 @@
   llvm::Value *EmitSVEMaskedStore(const CallExpr *,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   unsigned BuiltinID);
+  llvm::Value *EmitTileslice(llvm::Value *Offset, llvm::Value *Base);
+  llvm::Value *EmitSMELoadStore(SVETypeFlags TypeFlags,
+                                llvm::SmallVectorImpl<llvm::Value *> &Ops,
+                                unsigned IntID);
   llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
                                    SmallVectorImpl<llvm::Value *> &Ops,
                                    unsigned BuiltinID);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -9020,6 +9020,68 @@
   return Store;
 }
 
+Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { // Emits Base + Offset and returns the sum.
+  llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false); // resize Offset to i32; false = treat it as unsigned
+  return Builder.CreateAdd(Base, CastOffset, "tileslice"); // the sum is named "tileslice" in the emitted IR
+}
+
+Value *CodeGenFunction::EmitSMELoadStore(SVETypeFlags TypeFlags, // Reorders builtin operands into an intrinsic call; TypeFlags is not read here.
+                                         SmallVectorImpl<Value *> &Ops, // builtin call operands; indices 0-4 always read, 5 only when present
+                                         unsigned IntID) { // ID of the LLVM intrinsic to call
+  SmallVector<Value *> NewOps; // operands for the intrinsic call, in the order they are pushed
+  NewOps.push_back(Ops[3]); // builtin operand 3 becomes the first intrinsic operand
+
+  llvm::Type *BasePtrType; // pointee type the base pointer is cast to
+  switch (IntID) { // pick the pointee width from the ld1/st1 variant
+  case Intrinsic::aarch64_sme_ld1h_horiz:
+  case Intrinsic::aarch64_sme_ld1h_vert:
+  case Intrinsic::aarch64_sme_st1h_horiz:
+  case Intrinsic::aarch64_sme_st1h_vert:
+    BasePtrType = Int16Ty; // "h" variants
+    break;
+  case Intrinsic::aarch64_sme_ld1w_horiz:
+  case Intrinsic::aarch64_sme_ld1w_vert:
+  case Intrinsic::aarch64_sme_st1w_horiz:
+  case Intrinsic::aarch64_sme_st1w_vert:
+    BasePtrType = Int32Ty; // "w" variants
+    break;
+  case Intrinsic::aarch64_sme_ld1d_horiz:
+  case Intrinsic::aarch64_sme_ld1d_vert:
+  case Intrinsic::aarch64_sme_st1d_horiz:
+  case Intrinsic::aarch64_sme_st1d_vert:
+    BasePtrType = Int64Ty; // "d" variants
+    break;
+  case Intrinsic::aarch64_sme_ld1q_horiz:
+  case Intrinsic::aarch64_sme_ld1q_vert:
+  case Intrinsic::aarch64_sme_st1q_horiz:
+  case Intrinsic::aarch64_sme_st1q_vert:
+    BasePtrType = llvm::IntegerType::get(getLLVMContext(), 128); // "q" variants: a 128-bit integer type built explicitly
+    break;
+  default: // every ID not listed above, which includes the "b" variants
+    BasePtrType = Int8Ty;
+    break;
+  }
+
+  llvm::Value *BasePtr = // builtin operand 4, cast to a BasePtrType pointer in address space 0
+      Builder.CreatePointerCast(Ops[4], llvm::PointerType::get(BasePtrType, 0));
+
+  // A sixth operand is present only for the _vnum builtins; use it to offset the base pointer.
+  if (Ops.size() == 6) {
+    Function *StreamingVectorLength = // declaration of the sme.cntsb intrinsic
+        CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb, {});
+    llvm::Value *StreamingVectorLengthCall = // presumably the streaming vector length in bytes -- TODO confirm
+        Builder.CreateCall(StreamingVectorLength, {});
+    llvm::Value *Mulvl = // cntsb() * builtin operand 5
+        Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl");
+    BasePtr = Builder.CreateGEP(BasePtrType, BasePtr, Mulvl); // NOTE(review): GEP is typed on BasePtrType, so Mulvl is scaled by the element size, not bytes -- confirm this is intended
+  }
+  NewOps.push_back(BasePtr); // second intrinsic operand: the (possibly offset) typed pointer
+  NewOps.push_back(Ops[0]); // builtin operand 0 becomes the third intrinsic operand
+  NewOps.push_back(EmitTileslice(Ops[2], Ops[1])); // fourth intrinsic operand: Ops[1] + Ops[2]
+  Function *F = CGM.getIntrinsic(IntID, {}); // no overload types are supplied
+  return Builder.CreateCall(F, NewOps);
+}
+
 // Limit the usage of scalable llvm IR generated by the ACLE by using the
 // sve dup.x intrinsic instead of IRBuilder::CreateVectorSplat.
 Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
@@ -9120,6 +9182,8 @@
                              TypeFlags.isZExtReturn());
   else if (TypeFlags.isStore())
     return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
+  else if (TypeFlags.isSMELoadStore())
+    return EmitSMELoadStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isGatherLoad())
     return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isScatterStore())
Index: clang/lib/Basic/Targets/AArch64.h
===================================================================
--- clang/lib/Basic/Targets/AArch64.h
+++ clang/lib/Basic/Targets/AArch64.h
@@ -49,6 +49,7 @@
   bool HasSVE2SHA3;
   bool HasSVE2SM4;
   bool HasSVE2BitPerm;
+  bool HasSME;
   bool HasMatmulFP64;
   bool HasMatmulFP32;
   bool HasLSE;
Index: clang/lib/Basic/Targets/AArch64.cpp
===================================================================
--- clang/lib/Basic/Targets/AArch64.cpp
+++ clang/lib/Basic/Targets/AArch64.cpp
@@ -540,6 +540,7 @@
   HasSVE2SHA3 = false;
   HasSVE2SM4 = false;
   HasSVE2BitPerm = false;
+  HasSME = false;
   HasMatmulFP64 = false;
   HasMatmulFP32 = false;
   HasLSE = false;
@@ -583,6 +584,11 @@
       HasSVE2 = true;
       HasSVE2BitPerm = true;
     }
+    if (Feature == "+sme") {
+      HasSME = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
     if (Feature == "+f32mm") {
       FPU |= SveMode;
       HasMatmulFP32 = true;
Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -98,6 +98,7 @@
 // N: svfloat64_t
 
 // J: Prefetch type (sv_prfop)
+// %: pointer to void
 // A: pointer to int8_t
 // B: pointer to int16_t
 // C: pointer to int32_t
@@ -205,6 +206,8 @@
 def IsTupleCreate             : FlagType<0x100000000>;
 def IsTupleGet                : FlagType<0x200000000>;
 def IsTupleSet                : FlagType<0x400000000>;
+def IsSME                     : FlagType<0x800000000>;
+def IsSMELoadStore            : FlagType<0x1000000000>;
 
 // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
 class ImmCheckType<int val> {
@@ -542,6 +545,28 @@
   def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
+def SVLD1_HOR_ZA8 : MInst<"svld1_hor_za8", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_horiz">;
+def SVLD1_HOR_ZA16 : MInst<"svld1_hor_za16", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_horiz">;
+def SVLD1_HOR_ZA32 : MInst<"svld1_hor_za32", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_horiz">;
+def SVLD1_HOR_ZA64 : MInst<"svld1_hor_za64", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_horiz">;
+def SVLD1_HOR_ZA128 : MInst<"svld1_hor_za128", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_horiz">;
+def SVLD1_VER_ZA8 : MInst<"svld1_ver_za8", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_vert">;
+def SVLD1_VER_ZA16 : MInst<"svld1_ver_za16", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_vert">;
+def SVLD1_VER_ZA32 : MInst<"svld1_ver_za32", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_vert">;
+def SVLD1_VER_ZA64 : MInst<"svld1_ver_za64", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_vert">;
+def SVLD1_VER_ZA128 : MInst<"svld1_ver_za128", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_vert">;
+
+def SVLD1_HOR_VNUM_ZA8 : MInst<"svld1_hor_vnum_za8", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_horiz">;
+def SVLD1_HOR_VNUM_ZA16 : MInst<"svld1_hor_vnum_za16", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_horiz">;
+def SVLD1_HOR_VNUM_ZA32 : MInst<"svld1_hor_vnum_za32", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_horiz">;
+def SVLD1_HOR_VNUM_ZA64 : MInst<"svld1_hor_vnum_za64", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_horiz">;
+def SVLD1_HOR_VNUM_ZA128 : MInst<"svld1_hor_vnum_za128", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_horiz">;
+def SVLD1_VER_VNUM_ZA8 : MInst<"svld1_ver_vnum_za8", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_vert">;
+def SVLD1_VER_VNUM_ZA16 : MInst<"svld1_ver_vnum_za16", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_vert">;
+def SVLD1_VER_VNUM_ZA32 : MInst<"svld1_ver_vnum_za32", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_vert">;
+def SVLD1_VER_VNUM_ZA64 : MInst<"svld1_ver_vnum_za64", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_vert">;
+def SVLD1_VER_VNUM_ZA128 : MInst<"svld1_ver_vnum_za128", "vnmnPQl", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_vert">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
 
@@ -664,6 +689,28 @@
   def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
 }
 
+def SVST1_HOR_ZA8 : MInst<"svst1_hor_za8", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_horiz">;
+def SVST1_HOR_ZA16 : MInst<"svst1_hor_za16", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_horiz">;
+def SVST1_HOR_ZA32 : MInst<"svst1_hor_za32", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_horiz">;
+def SVST1_HOR_ZA64 : MInst<"svst1_hor_za64", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_horiz">;
+def SVST1_HOR_ZA128 : MInst<"svst1_hor_za128", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_horiz">;
+def SVST1_VER_ZA8 : MInst<"svst1_ver_za8", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_vert">;
+def SVST1_VER_ZA16 : MInst<"svst1_ver_za16", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_vert">;
+def SVST1_VER_ZA32 : MInst<"svst1_ver_za32", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_vert">;
+def SVST1_VER_ZA64 : MInst<"svst1_ver_za64", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_vert">;
+def SVST1_VER_ZA128 : MInst<"svst1_ver_za128", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_vert">;
+
+def SVST1_HOR_VNUM_ZA8 : MInst<"svst1_hor_vnum_za8", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_horiz">;
+def SVST1_HOR_VNUM_ZA16 : MInst<"svst1_hor_vnum_za16", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_horiz">;
+def SVST1_HOR_VNUM_ZA32 : MInst<"svst1_hor_vnum_za32", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_horiz">;
+def SVST1_HOR_VNUM_ZA64 : MInst<"svst1_hor_vnum_za64", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_horiz">;
+def SVST1_HOR_VNUM_ZA128 : MInst<"svst1_hor_vnum_za128", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_horiz">;
+def SVST1_VER_VNUM_ZA8 : MInst<"svst1_ver_vnum_za8", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_vert">;
+def SVST1_VER_VNUM_ZA16 : MInst<"svst1_ver_vnum_za16", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_vert">;
+def SVST1_VER_VNUM_ZA32 : MInst<"svst1_ver_vnum_za32", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_vert">;
+def SVST1_VER_VNUM_ZA64 : MInst<"svst1_ver_vnum_za64", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_vert">;
+def SVST1_VER_VNUM_ZA128 : MInst<"svst1_ver_vnum_za128", "vnmnP%l", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_vert">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Prefetches
 
Index: clang/include/clang/Basic/TargetBuiltins.h
===================================================================
--- clang/include/clang/Basic/TargetBuiltins.h
+++ clang/include/clang/Basic/TargetBuiltins.h
@@ -281,6 +281,8 @@
     bool isTupleCreate() const { return Flags & IsTupleCreate; }
     bool isTupleGet() const { return Flags & IsTupleGet; }
     bool isTupleSet() const { return Flags & IsTupleSet; }
+    bool isSME() const { return Flags & IsSME; } // true when the IsSME bit is set in Flags
+    bool isSMELoadStore() const { return Flags & IsSMELoadStore; } // true when the IsSMELoadStore bit is set in Flags
 
     uint64_t getBits() const { return Flags; }
     bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to