https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/77656
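The first patch below swaps the IR intrinsics that the SVCVT_* multi-vector builtins lower to; the user-facing ACLE prototypes themselves are unchanged. As a rough sketch of the builtins affected (wrapper names are illustrative; assumes an SME2 target, the ACLE SME header, and streaming callers, as in the clang tests updated below):

    #include <arm_sme.h>

    // Unsigned 32-bit to single-precision, two registers at a time:
    // with this fix the builtin lowers to UCVTF rather than FCVTZU.
    svfloat32x2_t to_f32_x2(svuint32x2_t zn) __arm_streaming {
      return svcvt_f32_u32_x2(zn);
    }

    // Single-precision to signed 32-bit, four registers at a time:
    // with this fix the builtin lowers to FCVTZS rather than SCVTF.
    svint32x4_t to_s32_x4(svfloat32x4_t zn) __arm_streaming {
      return svcvt_s32_f32_x4(zn);
    }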
>From 67be98b05d771dabe11af54b69532641fa548fb1 Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.dever...@arm.com>
Date: Wed, 10 Jan 2024 17:58:30 +0000
Subject: [PATCH 1/2] [AArch64][SME] Fix multi vector cvt builtins

This fixes cvt multi vector builtins that erroneously had inverted
return vectors and vector parameters. This caused the incorrect
instructions to be emitted.
---
 clang/include/clang/Basic/arm_sve.td | 18 ++---
 .../aarch64-sme2-intrinsics/acle_sme2_cvt.c | 32 ++++----
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 8 +-
 .../CodeGen/AArch64/sme2-intrinsics-cvt.ll | 80 +++++++++----------
 4 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 7f80fb0386cc77..301ef67b3d40bc 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2238,15 +2238,15 @@ let TargetGuard = "sme2" in {
   def SVCVT_F16_X2 : SInst<"svcvt_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvt_x2", [IsStreaming],[]>;
   def SVCVT_BF16_X2 : SInst<"svcvt_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvt_x2", [IsOverloadNone, IsStreaming],[]>;
 
-  def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_fcvtu_x2", [IsStreaming], []>;
-  def SVCVT_U32_F32_X2 : SInst<"svcvt_u32[_{d}_x2]", "2.u2.d", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming], []>;
-  def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_fcvts_x2", [IsStreaming], []>;
-  def SVCVT_S32_F32_X2 : SInst<"svcvt_s32[_{d}_x2]", "2.x2.d", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming], []>;
-
-  def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_fcvtu_x4", [IsStreaming], []>;
-  def SVCVT_U32_F32_X4 : SInst<"svcvt_u32[_{d}_x4]", "4.u4.d", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming], []>;
-  def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_fcvts_x4", [IsStreaming], []>;
-  def SVCVT_S32_F32_X4 : SInst<"svcvt_s32[_{d}_x4]", "4.x4.d", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming], []>;
+  def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming], []>;
+  def SVCVT_U32_F32_X2 : SInst<"svcvt_u32[_{d}_x2]", "2.u2.d", "f", MergeNone, "aarch64_sve_fcvtu_x2", [IsStreaming], []>;
+  def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming], []>;
+  def SVCVT_S32_F32_X2 : SInst<"svcvt_s32[_{d}_x2]", "2.x2.d", "f", MergeNone, "aarch64_sve_fcvts_x2", [IsStreaming], []>;
+
+  def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming], []>;
+  def SVCVT_U32_F32_X4 : SInst<"svcvt_u32[_{d}_x4]", "4.u4.d", "f", MergeNone, "aarch64_sve_fcvtu_x4", [IsStreaming], []>;
+  def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming], []>;
+  def SVCVT_S32_F32_X4 : SInst<"svcvt_s32[_{d}_x4]", "4.x4.d", "f", MergeNone, "aarch64_sve_fcvts_x4", [IsStreaming], []>;
 }
 
 //
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index a3ee7d2092f79f..f596b3167f2b82 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -59,7 +59,7 @@ svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) __arm_streaming {
// 
CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 @@ -70,7 +70,7 @@ svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 @@ -85,7 +85,7 @@ svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 @@ -96,7 +96,7 @@ svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale 
x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 @@ -111,7 +111,7 @@ svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1 @@ -122,7 +122,7 @@ svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1 @@ -137,7 +137,7 @@ svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } 
@llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1 @@ -148,7 +148,7 @@ svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP2]], 1 @@ -166,7 +166,7 @@ svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 @@ -183,7 +183,7 @@ svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale 
x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 @@ -204,7 +204,7 @@ svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 @@ -221,7 +221,7 @@ svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x 
i32> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 @@ -242,7 +242,7 @@ svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1 @@ -259,7 +259,7 @@ svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale 
x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1 @@ -280,7 +280,7 @@ svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1 @@ -297,7 +297,7 @@ svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP4]], 1 diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9088168b4c67b1..7dd97b4b42736a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3095,23 +3095,23 
@@ let TargetPrefix = "aarch64" in { [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class SME2_CVT_FtoI_VG2_Intrinsic + class SME2_CVT_ItoF_VG2_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; - class SME2_CVT_ItoF_VG2_Intrinsic + class SME2_CVT_FtoI_VG2_Intrinsic : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; - class SME2_CVT_FtoI_VG4_Intrinsic + class SME2_CVT_ItoF_VG4_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; - class SME2_CVT_ItoF_VG4_Intrinsic + class SME2_CVT_FtoI_VG4_Intrinsic : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll index 8e2824122383fd..4222e32ee35d8b 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll @@ -34,19 +34,19 @@ define <vscale x 8 x bfloat> @multi_vector_cvt_x2_bf16(<vscale x 4 x float> %unu ; ; FCVTZS ; -define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) { -; CHECK-LABEL: multi_vector_cvt_x2_f32_s32: +define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) { +; CHECK-LABEL: multi_vector_cvt_x2_s32_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: fcvtzs { z0.s, z1.s }, { z2.s, z3.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1) - ret {<vscale x 4 x float>, <vscale x 4 x float>} %res + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res } -define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) { -; CHECK-LABEL: multi_vector_cvt_x4_f32_s32: +define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_s32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) { +; CHECK-LABEL: multi_vector_cvt_x4_s32_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: mov z6.d, z3.d @@ -54,26 +54,26 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: fcvtzs { z0.s - z3.s }, { z4.s - z7.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1, 
<vscale x 4 x i32>%zn2, <vscale x 4 x i32>%zn3) - ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res } ; ; FCVTZU ; -define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) { -; CHECK-LABEL: multi_vector_cvt_x2_f32_u32: +define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) { +; CHECK-LABEL: multi_vector_cvt_x2_u32_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: fcvtzu { z0.s, z1.s }, { z2.s, z3.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1) - ret {<vscale x 4 x float>, <vscale x 4 x float>} %res + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res } -define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) { -; CHECK-LABEL: multi_vector_cvt_x4_f32_u32: +define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_u32_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) { +; CHECK-LABEL: multi_vector_cvt_x4_u32_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: mov z6.d, z3.d @@ -81,26 +81,26 @@ define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscal ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: fcvtzu { z0.s - z3.s }, { z4.s - z7.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32>%zn0, <vscale x 4 x i32>%zn1, <vscale x 4 x i32>%zn2, <vscale x 4 x i32>%zn3) - ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res + %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res } ; ; SCVTF ; -define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_s32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) { -; CHECK-LABEL: multi_vector_cvt_x2_s32_f32: +define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) { +; CHECK-LABEL: multi_vector_cvt_x2_f32_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: scvtf { z0.s, z1.s }, { z2.s, z3.s } ; 
CHECK-NEXT: ret - %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1) - ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res + %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) + ret {<vscale x 4 x float>, <vscale x 4 x float>} %res } -define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_s32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) { -; CHECK-LABEL: multi_vector_cvt_x4_s32_f32: +define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) { +; CHECK-LABEL: multi_vector_cvt_x4_f32_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: mov z6.d, z3.d @@ -108,26 +108,26 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: scvtf { z0.s - z3.s }, { z4.s - z7.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1, <vscale x 4 x float>%zn2, <vscale x 4 x float>%zn3) - ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res + %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) + ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res } ; ; UCVTF ; -define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_u32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) { -; CHECK-LABEL: multi_vector_cvt_x2_u32_f32: +define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) { +; CHECK-LABEL: multi_vector_cvt_x2_f32_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ucvtf { z0.s, z1.s }, { z2.s, z3.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1) - ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res + %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) + ret {<vscale x 4 x float>, <vscale x 4 x float>} %res } -define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_u32_f32(<vscale x 4 x float>%unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,<vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) { -; CHECK-LABEL: multi_vector_cvt_x4_u32_f32: +define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_f32_u32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,<vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) { +; CHECK-LABEL: multi_vector_cvt_x4_f32_u32: ; CHECK: // %bb.0: ; 
CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: mov z6.d, z3.d @@ -135,17 +135,17 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: ucvtf { z0.s - z3.s }, { z4.s - z7.s } ; CHECK-NEXT: ret - %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float>%zn0, <vscale x 4 x float>%zn1, <vscale x 4 x float>%zn2, <vscale x 4 x float>%zn3) - ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res + %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) + ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res } declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>) declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>) -declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>) -declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>) -declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>) -declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>) -declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>) -declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>) -declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>) -declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>) +declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>) +declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>) +declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4i32(<vscale x 4 x i32>,<vscale x 4 x i32>) +declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4i32(<vscale x 4 x i32>,<vscale x 4 x i32>) +declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>) +declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>,<vscale x 4 x float>) +declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4i32(<vscale x 4 x 
i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4i32(<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>,<vscale x 4 x i32>)

>From 03b0a011d5613b1f5633f0e52ebba3ee806f9b9a Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.dever...@arm.com>
Date: Thu, 11 Jan 2024 10:39:02 +0000
Subject: [PATCH 2/2] Rename VG2 intrinsics to X2

---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7dd97b4b42736a..8c0f06ed2ba527 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3095,23 +3095,23 @@ let TargetPrefix = "aarch64" in {
                             [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [IntrNoMem]>;
-  class SME2_CVT_ItoF_VG2_Intrinsic
+  class SME2_CVT_ItoF_X2_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                             [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
                             [IntrNoMem]>;
-  class SME2_CVT_FtoI_VG2_Intrinsic
+  class SME2_CVT_FtoI_X2_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>],
                             [IntrNoMem]>;
-  class SME2_CVT_ItoF_VG4_Intrinsic
+  class SME2_CVT_ItoF_X4_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                             [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
                             [IntrNoMem]>;
-  class SME2_CVT_FtoI_VG4_Intrinsic
+  class SME2_CVT_FtoI_X4_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
                             [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -3403,14 +3403,14 @@ let TargetPrefix = "aarch64" in {
   //
   def int_aarch64_sve_fcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
   def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
-  def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
-  def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
-  def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
-  def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
-  def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
-  def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
-  def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
-  def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
+  def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_X2_Intrinsic;
+  def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_X2_Intrinsic;
+  def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic;
+  def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic;
+  def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_X4_Intrinsic;
+  def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_X4_Intrinsic;
+  def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic;
+  def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic;
 
 //
 // Multi-vector saturating extract narrow

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits