[clang] [llvm] [AArch64][SME] Add intrinsics for multi-vector BFCLAMP (PR #93532)

via cfe-commits Tue, 28 May 2024 04:10:03 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-clang

Author: None (Lukacma)

<details>
<summary>Changes</summary>

According to the specification in
https://github.com/ARM-software/acle/pull/309 this adds the intrinsics

```
  svbfloat16x2_t svclamp[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zn,
                                        svbfloat16_t zm)  __arm_streaming;

  svbfloat16x4_t svclamp[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zn,
                                        svbfloat16_t zm)  __arm_streaming;
```
These are available only  if __ARM_FEATURE_SME_B16B16 is enabled.

---
Full diff: https://github.com/llvm/llvm-project/pull/93532.diff


6 Files Affected:

- (modified) clang/include/clang/Basic/arm_sve.td (+5) 
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c (+64) 
- (added) clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp (+13) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+2) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+6) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll (+25-1) 


``````````diff
diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index 03570f94de666..078ef576342a7 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in {
   def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "hfd",      
MergeNone, "aarch64_sve_fclamp_single_x4",  [IsStreaming], []>;
 }
 
+let TargetGuard = "sme2,b16b16"in {
+  def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]",  "22dd",   "b",      
MergeNone, "aarch64_sve_bfclamp_single_x2",  [IsStreaming], []>;
+  def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "b",      
MergeNone, "aarch64_sve_bfclamp_single_x4",  [IsStreaming], []>;
+}
+
 let TargetGuard = "sme2" in {
 // == ADD (vectors) ==
   def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", 
MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 57ea4d2a1ac47..54b480eec97cd 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t 
op1, svfloat32_t op2, svf
 svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, 
svfloat64_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3);
 }
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 
0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 
8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x 
bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> 
[[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale 
x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> 
@llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 
8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale 
x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> 
@llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale 
x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+// CPP-CHECK-LABEL: 
@_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 
0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 
8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> 
[[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> 
@llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 
8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> 
@llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale 
x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t 
op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 
0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 
8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } 
@llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], 
<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x 
bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> 
[[OP3:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale 
x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 
8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale 
x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale 
x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale 
x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale 
x 8 x bfloat> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 
3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], 
<vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+// CPP-CHECK-LABEL: 
@_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 
0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> 
@llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } 
@llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], 
<vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x 
bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> 
[[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 
8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 
1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale 
x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 
2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale 
x 8 x bfloat> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 
3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> 
@llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], 
<vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t 
op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp 
b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
new file mode 100644
index 0000000000000..b8f1a4e29c8f1
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
@@ -0,0 +1,13 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 
-fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s
+
+#include <arm_sme.h>
+
+void test_b16b16(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t 
*const_bf16_ptr, bfloat16_t *bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t 
bf16x2, svbfloat16x3_t bf16x3, svbfloat16x4_t bf16x4) __arm_streaming
+{
+  // expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature 
sme2,b16b16}}
+  svclamp_single_bf16_x2(bf16x2, bf16, bf16);
+  // expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature 
sme2,b16b16}}
+  svclamp_single_bf16_x4(bf16x4, bf16, bf16);
+}
\ No newline at end of file
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 4544cf35fb7b3..57d0dfb698b38 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3472,10 +3472,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sclamp_single_x2 : 
SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x2 : 
SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x2 : 
SME2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x2 : 
SME2_VG2_Multi_Single_Single_Intrinsic;
 
   def int_aarch64_sve_sclamp_single_x4 : 
SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x4 : 
SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x4 : 
SME2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x4 : 
SME2_VG4_Multi_Single_Single_Intrinsic;
 
   //
   // Multi-vector add/sub and accumulate into ZA
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp 
b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 25f2e4d7c4de6..660675cf8f389 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5738,6 +5738,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG2_2Z2Z_D}))
         SelectClamp(Node, 2, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x2:
+      SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x4:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
@@ -5759,6 +5762,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG4_4Z4Z_D}))
         SelectClamp(Node, 4, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x4:
+      SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_add_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll 
b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
index 61b67755a3544..7934f831a7e62 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 
-verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 
-mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x 
bfloat> %b, <vscale x 8 x bfloat> %c){
 ; CHECK-LABEL: bfclamp:
@@ -11,3 +11,27 @@ define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> 
%a, <vscale x 8 x bf
 }
 
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x 
bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } 
@test_bfclamp_single_x2_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, 
<vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d){
+; CHECK-LABEL: test_bfclamp_single_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfclamp { z0.h, z1.h }, z2.h, z3.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } 
@llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale 
x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat> } @test_bfclamp_single_x4_f16(<vscale x 8 x bfloat> %a, 
<vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, 
<vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f){
+; CHECK-LABEL: test_bfclamp_single_x4_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3
+; CHECK-NEXT:    bfclamp { z0.h - z3.h }, z4.h, z5.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x 
bfloat>, <vscale x 8 x bfloat> } 
@llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale 
x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 
8 x bfloat> %e, <vscale x 8 x bfloat> %f)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, 
<vscale x 8 x bfloat> } %res
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/93532
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [AArch64][SME] Add intrinsics for multi-vector BFCLAMP (PR #93532)

Reply via email to