[clang] [llvm] [Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction (PR #97755)

via cfe-commits Fri, 05 Jul 2024 02:25:03 -0700

https://github.com/CarolineConcatto updated https://github.com/llvm/llvm-project/pull/97755


>From 22f5bb7cab1673632f1fa5438a35f861c16d63b2 Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.conca...@arm.com>
Date: Thu, 4 Jul 2024 17:10:36 +0000
Subject: [PATCH 1/2] [Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2
 instruction

This patch adds the following intrinsics:

// Variants are also available for: _s8
  svuint8x4_t svluti4_zt_u8_x4(uint64_t zt0, svuint8x2_t zn) __arm_streaming __arm_in("zt0");

according to ACLE PR #324 [1].
[1] ARM-software/acle#324
---
 clang/include/clang/Basic/arm_sme.td          |  5 ++
 .../acle_sme2_luti4_zt.c                      | 82 +++++++++++++++++++
 .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp |  5 ++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  6 ++
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 19 ++++-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 +-
 .../AArch64/sme2-intrinsics-write-zt.ll       | 17 ++++
 7 files changed, 132 insertions(+), 4 deletions(-)
 create mode 100644 
clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
 create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll

diff --git a/clang/include/clang/Basic/arm_sme.td 
b/clang/include/clang/Basic/arm_sme.td
index ce8908f566f2fd..e4a61caae733ec 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{
 
 defm SVREADZ_VG2 :  ZAReadzArray<"2">;
 defm SVREADZ_VG4 :  ZAReadzArray<"4">;
+
+let SMETargetGuard = "sme2,sme-lutv2" in {
+  def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2", "cUc", MergeNone, 
"aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, 
ImmCheck0_0>]>;
+}
+
 } // let SVETargetGuard = InvalidMode
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
new file mode 100644
index 00000000000000..2e7cd0939f516b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
@@ -0,0 +1,82 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1  -triple aarch64-none-linux-gnu -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 
-Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++  -triple aarch64-none-linux-gnu -target-feature 
+bf16 -target-feature +sme -target-feature +sme2 -target-feature  +sme-lutv2  
-O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS   -triple aarch64-none-linux-gnu 
-target-feature +bf16 -target-feature +sme -target-feature +sme2 
-target-feature  +sme-lutv2  -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++  -triple 
aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme 
-target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm 
-o - %s | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1  -triple aarch64-none-linux-gnu -target-feature +bf16 
-target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -S 
-Werror -Wall -o /dev/null %s
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_u8_x4(
+// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 
x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } 
@llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], 
<vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x 
i8> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x 
i8> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x 
i8> [[TMP7]], i64 32)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> 
@_Z19test_luti4_zt_u8_x411svuint8x2_t(
+// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } 
@llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], 
<vscale x 16 x i8> [[TMP1]])
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x 
i8> [[TMP3]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x 
i8> [[TMP5]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x 
i8> [[TMP7]], i64 32)
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
+// CHECK-CXX-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
+//
+svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op)  __arm_streaming 
__arm_in("zt0") {
+  return svluti4_zt_u8_x4(0, op);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_s8_x4(
+// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 
x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } 
@llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], 
<vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x 
i8> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x 
i8> [[TMP5]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x 
i8> [[TMP7]], i64 32)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
+// CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> 
@_Z19test_luti4_zt_s8_x410svint8x2_t(
+// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr 
#[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 
16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } 
@llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], 
<vscale x 16 x i8> [[TMP1]])
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x 
i8> [[TMP3]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x 
i8> [[TMP5]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x 
i8> [[TMP7]], i64 32)
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, 
<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
+// CHECK-CXX-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
+//
+svint8x4_t test_luti4_zt_s8_x4(svint8x2_t op)  __arm_streaming __arm_in("zt0") 
{
+  return svluti4_zt_s8_x4(0, op);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp 
b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 5de97649af5d3a..567273ab141f25 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, 
svuint16_t z_u16,
   svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error 
{{argument value 4 is outside the valid range [0, 3]}}
   svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error 
{{argument value 4 is outside the valid range [0, 3]}}
 }
+
+void test_rluti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") {
+  // Check Zt tile 0
+  svluti4_zt_u8_x4(1, op);  // expected-error {{argument value 1 is outside 
the valid range [0, 0]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6f3694cf952d47..e65ebd5207d0af 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3678,6 +3678,12 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, 
LLVMMatchType<0>, LLVMMatchType<0>],
                             [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
                             [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, 
IntrReadMem]>;
+
+  def int_aarch64_sme_luti4_zt_x4
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, 
LLVMMatchType<0>, LLVMMatchType<0>],
+                            [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+                            [ImmArg<ArgIndex<0>>, IntrNoMem, 
IntrHasSideEffects]>;
+
 }
 
 // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp 
b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 59cfd8d6c27d25..9874a20fa20d4f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -410,7 +410,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   }
 
   void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
-                             uint32_t MaxImm);
+                             uint32_t MaxImm, bool IsMultiVector = false);
 
   template <unsigned MaxIdx, unsigned Scale>
   bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
@@ -1896,15 +1896,23 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, 
unsigned NumVecs,
 
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                 unsigned NumOutVecs,
-                                                unsigned Opc, uint32_t MaxImm) 
{
+                                                unsigned Opc, uint32_t MaxImm,
+                                                bool IsMultiVector) {
   if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
     if (Imm->getZExtValue() > MaxImm)
       return;
 
   SDValue ZtValue;
+  SmallVector<SDValue, 4> Ops;
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
-  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+  Ops.push_back(ZtValue);
+  if (IsMultiVector) {
+    Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
+  } else {
+    Ops.push_back(Node->getOperand(3));
+    Ops.push_back(Node->getOperand(4));
+  }
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
 
@@ -5415,6 +5423,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
         SelectMultiVectorLuti(Node, 2, Opc, 3);
       return;
     }
+    case Intrinsic::aarch64_sme_luti4_zt_x4: {
+      // Does not have immediate but it has 2ZPR input
+      SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 0, true);
+      return;
+    }
     }
   } break;
   case ISD::INTRINSIC_WO_CHAIN: {
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 709a98d3a8cb4d..111deefec860fc 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -936,7 +936,7 @@ defm FAMIN_4Z4Z : 
sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;
 
 let Predicates = [HasSME2, HasSME_LUTv2] in {
 defm MOVT : sme2_movt_zt_to_zt<"movt",  0b0011111>;
-def LUTI4_4ZZT2Z    : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
+def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
 } //[HasSME2, HasSME_LUTv2]
 
 let Predicates = [HasSME2p1, HasSME_LUTv2] in {
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll 
b/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll
new file mode 100644
index 00000000000000..778f31194baf45
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 
16 x i8>}  @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 
{
+; CHECK-LABEL: test_luti4_zt_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    luti4 { z0.b - z3.b }, zt0, { z0, z1 }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, 
<vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x 
i8> %v0, <vscale x 16 x i8> %v1)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 
16 x i8>} %res
+}
+
+attributes #0 = { "target-features"="+sme2,+sme-lutv2"}

>From 1163ccfe900e6c0d940d03262d2ca14af6d21adc Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.conca...@arm.com>
Date: Fri, 5 Jul 2024 08:32:01 +0000
Subject: [PATCH 2/2] Lut should have index always as unsigned

---
 clang/include/clang/Basic/arm_sme.td                          | 2 +-
 .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c | 4 ++--
 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sme.td 
b/clang/include/clang/Basic/arm_sme.td
index e4a61caae733ec..314a97248272c1 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -819,7 +819,7 @@ defm SVREADZ_VG2 :  ZAReadzArray<"2">;
 defm SVREADZ_VG4 :  ZAReadzArray<"4">;
 
 let SMETargetGuard = "sme2,sme-lutv2" in {
-  def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2", "cUc", MergeNone, 
"aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, 
ImmCheck0_0>]>;
+  def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, 
"aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, 
ImmCheck0_0>]>;
 }
 
 } // let SVETargetGuard = InvalidMode
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
index 2e7cd0939f516b..47f36a8b000c07 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
@@ -61,7 +61,7 @@ svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op)  
__arm_streaming __arm_in("zt0")
 // CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
 // CHECK-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
 //
-// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> 
@_Z19test_luti4_zt_s8_x410svint8x2_t(
+// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> 
@_Z19test_luti4_zt_s8_x411svuint8x2_t(
 // CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr 
#[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
 // CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0)
@@ -77,6 +77,6 @@ svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op)  
__arm_streaming __arm_in("zt0")
 // CHECK-CXX-NEXT:    [[TMP10:%.*]] = tail call <vscale x 64 x i8> 
@llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x 
i8> [[TMP9]], i64 48)
 // CHECK-CXX-NEXT:    ret <vscale x 64 x i8> [[TMP10]]
 //
-svint8x4_t test_luti4_zt_s8_x4(svint8x2_t op)  __arm_streaming __arm_in("zt0") 
{
+svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op)  __arm_streaming 
__arm_in("zt0") {
   return svluti4_zt_s8_x4(0, op);
 }
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp 
b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 567273ab141f25..03f24f79b9dbcf 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -351,7 +351,7 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, 
svuint16_t z_u16,
   svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error 
{{argument value 4 is outside the valid range [0, 3]}}
 }
 
-void test_rluti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") {
+void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") {
   // Check Zt tile 0
   svluti4_zt_u8_x4(1, op);  // expected-error {{argument value 1 is outside 
the valid range [0, 0]}}
 }

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction (PR #97755)

Reply via email to