https://github.com/CarolineConcatto updated https://github.com/llvm/llvm-project/pull/97755
>From 22f5bb7cab1673632f1fa5438a35f861c16d63b2 Mon Sep 17 00:00:00 2001 From: Caroline Concatto <caroline.conca...@arm.com> Date: Thu, 4 Jul 2024 17:10:36 +0000 Subject: [PATCH 1/2] [Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction This patch adds these intrinsics: // Variants are also available for: _s8 svuint8x4_t svluti4_zt_u8_x4(uint64_t zt0, svuint8x2_t zn) __arm_streaming __arm_in("zt0"); according to PR#324[1] [1]ARM-software/acle#324 --- clang/include/clang/Basic/arm_sme.td | 5 ++ .../acle_sme2_luti4_zt.c | 82 +++++++++++++++++++ .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 5 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 6 ++ .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 19 ++++- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +- .../AArch64/sme2-intrinsics-write-zt.ll | 17 ++++ 7 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ce8908f566f2fd..e4a61caae733ec 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{ defm SVREADZ_VG2 : ZAReadzArray<"2">; defm SVREADZ_VG4 : ZAReadzArray<"4">; + +let SMETargetGuard = "sme2,sme-lutv2" in { + def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>; +} + } // let SVETargetGuard = InvalidMode diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c new file mode 100644 index 00000000000000..2e7cd0939f516b --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c @@ -0,0 +1,82 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -O2 -S -Werror -Wall -o /dev/null %s +// REQUIRES: aarch64-registered-target + +#include <arm_sme.h> + +// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_u8_x4( +// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) +// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_u8_x411svuint8x2_t( +// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) +// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]] +// +svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_u8_x4(0, op); +} + +// CHECK-LABEL: define dso_local <vscale x 64 x i8> @test_luti4_zt_s8_x4( +// CHECK-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) +// CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x410svint8x2_t( +// CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 16) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 0 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> [[TMP3]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 1 +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 2 +// CHECK-CXX-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 32) +// CHECK-CXX-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP2]], 3 +// CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) +// CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]] +// +svint8x4_t test_luti4_zt_s8_x4(svint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_s8_x4(0, op); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 5de97649af5d3a..567273ab141f25 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } + +void test_rluti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") { + // Check Zt tile 0 + svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6f3694cf952d47..e65ebd5207d0af 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3678,6 +3678,12 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>; + + def int_aarch64_sme_luti4_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; + } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 59cfd8d6c27d25..9874a20fa20d4f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -410,7 +410,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { } void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc, - uint32_t MaxImm); + uint32_t MaxImm, bool IsMultiVector = false); template <unsigned MaxIdx, unsigned Scale> bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { @@ -1896,15 +1896,23 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, - unsigned Opc, uint32_t MaxImm) { + unsigned Opc, uint32_t MaxImm, + bool IsMultiVector) { if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4))) if (Imm->getZExtValue() > MaxImm) return; SDValue ZtValue; + SmallVector<SDValue, 4> Ops; if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue)) return; - SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + Ops.push_back(ZtValue); + if (IsMultiVector) { + Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + } else { + Ops.push_back(Node->getOperand(3)); + Ops.push_back(Node->getOperand(4)); + } SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -5415,6 +5423,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectMultiVectorLuti(Node, 2, Opc, 3); return; } + case Intrinsic::aarch64_sme_luti4_zt_x4: { + // Does not have immediate but it has 2ZPR input + SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z, 0, true); + return; + } } } break; case ISD::INTRINSIC_WO_CHAIN: { diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 709a98d3a8cb4d..111deefec860fc 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -936,7 +936,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>; let Predicates = [HasSME2, HasSME_LUTv2] in { defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>; -def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; +def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; } //[HasSME2, HasSME_LUTv2] let Predicates = [HasSME2p1, HasSME_LUTv2] in { diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll new file mode 100644 index 00000000000000..778f31194baf45 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-write-zt.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 { +; CHECK-LABEL: test_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } +; CHECK-NEXT: ret + %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) + ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res +} + +attributes #0 = { "target-features"="+sme2,+sme-lutv2"} >From 1163ccfe900e6c0d940d03262d2ca14af6d21adc Mon Sep 17 00:00:00 2001 From: Caroline Concatto <caroline.conca...@arm.com> Date: Fri, 5 Jul 2024 08:32:01 +0000 Subject: [PATCH 2/2] Lut should have index always as unsigned --- clang/include/clang/Basic/arm_sme.td | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c | 4 ++-- clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index e4a61caae733ec..314a97248272c1 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -819,7 +819,7 @@ defm SVREADZ_VG2 : ZAReadzArray<"2">; defm SVREADZ_VG4 : ZAReadzArray<"4">; let SMETargetGuard = "sme2,sme-lutv2" in { - def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>; + def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>; } } // let SVETargetGuard = InvalidMode diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c index 2e7cd0939f516b..47f36a8b000c07 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c @@ -61,7 +61,7 @@ svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") // CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP10]] // -// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x410svint8x2_t( +// CHECK-CXX-LABEL: define dso_local <vscale x 64 x i8> @_Z19test_luti4_zt_s8_x411svuint8x2_t( // CHECK-CXX-SAME: <vscale x 32 x i8> [[OP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[OP]], i64 0) @@ -77,6 +77,6 @@ svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") // CHECK-CXX-NEXT: [[TMP10:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i64 48) // CHECK-CXX-NEXT: ret <vscale x 64 x i8> [[TMP10]] // -svint8x4_t test_luti4_zt_s8_x4(svint8x2_t op) __arm_streaming __arm_in("zt0") { +svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { return svluti4_zt_s8_x4(0, op); } diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 567273ab141f25..03f24f79b9dbcf 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -351,7 +351,7 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } -void test_rluti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") { +void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_inout("zt0") { // Check Zt tile 0 svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}} } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits