[clang] [llvm] [SME] Add intrinsics for FCVT(wid.) and FCVTL (PR #93202)

via cfe-commits Thu, 23 May 2024 07:45:34 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: None (Lukacma)

<details>
<summary>Changes</summary>

According to the specification in
https://github.com/ARM-software/acle/pull/309 this adds the intrinsics
```
svfloat32x2_t svcvt_f32[_f16_x2](svfloat16_t zn) __arm_streaming;
svfloat32x2_t svcvtl_f32[_f16_x2](svfloat16_t zn) __arm_streaming;

```
These are available only if __ARM_FEATURE_SME_F16F16 is enabled.

---
Full diff: https://github.com/llvm/llvm-project/pull/93202.diff


7 Files Affected:

- (modified) clang/include/clang/Basic/arm_sve.td (+11) 
- (modified) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c (+22) 
- (added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c (+40) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+13-1) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+6) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll (+10-1) 
- (added) llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll (+11) 


``````````diff
diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td
index 15340ebb62b36..af0562ad87080 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2265,6 +2265,10 @@ let TargetGuard = "sme2" in {
   def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i",  
MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, 
IsOverloadWhileOrMultiVecCvt], []>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, 
"aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector floating-point convert from single-precision to interleaved 
half-precision/BFloat16
 //
@@ -2273,6 +2277,13 @@ let TargetGuard = "sme2" in {
   def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, 
"aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>;
 }
 
+//
+//Multi-vector floating-point convert from half-precision to deinterleaved 
single-precision.
+//
+let TargetGuard = "sme-f16f16" in {
+  def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, 
"aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector saturating extract narrow
 //
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index 79a11c2ec153e..d117a685bfc29 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) 
__arm_streaming {
 svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn);
 }
+
+// CHECK-LABEL: @test_cvt_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 
x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> 
[[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale 
x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x 
float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale 
x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 
x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale 
x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> 
[[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x 
float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 
x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+__attribute__((target("sme-f16f16"))) svfloat32x2_t 
test_cvt_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c 
b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
new file mode 100644
index 0000000000000..1142065614b8f
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
@@ -0,0 +1,40 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o 
- %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o 
- -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s 
-check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s
+// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu 
-target-feature +sme -target-feature +sme-f16f16 -S -disable-O0-optnone -Werror 
-Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | 
FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme 
-target-feature +sme-f16f16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_cvtl_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 
x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> 
[[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale 
x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x 
float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale 
x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 
x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale 
x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> 
[[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x 
float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 
x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index e31e00a9c76f3..7b8eeafec597b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
+  
+  class SME2_CVT_WIDENING_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+  
 
   class SME2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3412,6 +3417,13 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_suvdot_lane_za32_vg1x4 : 
SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
   def int_aarch64_sme_usvdot_lane_za32_vg1x4 : 
SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
+
+  //
+  //Multi-vector floating-point convert from half-precision to deinterleaved 
single-precision.
+  //
+  
+  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+
   //
   // Multi-vector floating-point CVT from single-precision to interleaved 
half-precision/BFloat16
   //
@@ -3431,7 +3443,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-
+  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp 
b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 80272213dd389..6db04c37e8a42 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5713,6 +5713,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sve_ucvtf_x4:
       SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
       return;
+    case Intrinsic::aarch64_sve_fcvt_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S);
+      return;
+    case Intrinsic::aarch64_sve_fcvtl_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll 
b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
index bc1db878cbd31..611cdcda157e2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s 
| FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs 
< %s | FileCheck %s
 
 ;
 ; FCVT
@@ -139,6 +139,15 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale 
x 4 x float>, <vscale
   ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, 
<vscale x 4 x float>} %res
 }
 
+define {<vscale x 4 x float>, <vscale x 4 x float>}  
@multi_vector_cvt_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvt_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvt { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } 
@llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x 
float>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, 
<vscale x 4 x float>)
 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} 
@llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(<vscale x 4 x float>,<vscale x 4 x 
float>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll 
b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
new file mode 100644
index 0000000000000..30dc7cbfaea6c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs 
< %s | FileCheck %s
+
+define {<vscale x 4 x float>, <vscale x 4 x float>}  
@multi_vector_cvtl_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } 
@llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/93202
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [SME] Add intrinsics for FCVT(wid.) and FCVTL (PR #93202)

Reply via email to