https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/118126
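For reviewers' reference, a minimal usage sketch of the intrinsics these patches add (illustration only, not part of the patches; it assumes a toolchain with the sve2, fp8, fp8dot4 and fp8fma features enabled, and takes the FPMR configuration value `fpm` as a parameter rather than constructing one):

#include <arm_sve.h>

// 4-way widening dot product: each f32 lane of `acc` accumulates the dot
// product of a group of four FP8 elements from `a` and `b`; `fpm` is
// written to FPMR and selects the FP8 formats.
svfloat32_t dot4(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm) {
  return svdot_f32_mf8_fpm(acc, a, b, fpm);
}

// Widening multiply-add long (bottom): multiplies the even-numbered FP8
// elements of `a` and `b` and accumulates the products into the f16
// lanes of `acc`.
svfloat16_t mlalb(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpm) {
  return svmlalb_f16_mf8_fpm(acc, a, b, fpm);
}
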
>From 5bc5078af32cda3dbcf3ca8dd53b01996ad34ea1 Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.veli...@arm.com> Date: Mon, 25 Nov 2024 17:21:55 +0000 Subject: [PATCH 1/4] [AArch64] Implements FP8 SVE intrinsics for dot-product This patch adds the following intrinsics: * 8-bit floating-point dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svdot[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to single-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT4) || __ARM_FEATURE_SSVE_FP8DOT4 svfloat32_t svdot_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_3, fpm_t fpm); * 8-bit floating-point dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svdot[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point indexed dot product to half-precision. // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8DOT2) || __ARM_FEATURE_SSVE_FP8DOT2 svfloat16_t svdot_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_7, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 19 +++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 11 +- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++++++++++++++++++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++- clang/utils/TableGen/SveEmitter.cpp | 9 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 16 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 29 +++- llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 +++++ 10 files changed, 293 insertions(+), 14 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b9f40faf0b18e6..2c8ca8014387d3 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2476,3 +2476,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in { def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>; def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>; } + +let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in { + // 8-bit floating-point dot product to half-precision (vectors) + def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to half-precision (indexed) + def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} + +let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { + // 8-bit floating-point dot product to single-precision (vectors) + def 
SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point dot product to single-precision (indexed) + def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; +} + diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index de10be7bdce0db..44201b15505599 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -105,6 +105,7 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t +// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 49a4c1ecc825e7..84048a4beac2c5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10719,7 +10719,16 @@ Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) { cast<llvm::VectorType>(Ty)->getElementCount(), Scalar); } -Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) { +Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) { + if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) { +#ifndef NDEBUG + auto *VecTy = cast<llvm::VectorType>(Ty); + ElementCount EC = VecTy->getElementCount(); + assert(EC.isScalar() && VecTy->getElementType() == Int8Ty && + "Only <1 x i8> expected"); +#endif + Scalar = Builder.CreateExtractElement(Scalar, uint64_t(0)); + } return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType())); } diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c new file mode 100644 index 00000000000000..950a19115811ec --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -0,0 +1,149 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 
-target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include <arm_sme.h> +#else +#include <arm_sve.h> +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z18test_svdot_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot,_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_n_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z20test_svdot_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> 
[[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +svfloat32_t test_svdot_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot,_n_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z18test_svdot_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot,_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_n_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svdot_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> 
[[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +svfloat16_t test_svdot_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot,_n_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_lane_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 3) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z23test_svdot_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 3) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svdot_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot_lane,_f32_mf8,_fpm)(zda, zn, zm, 3, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_lane_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z23test_svdot_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svdot_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svdot_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm); +} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c index e47efccf480433..d76e729b6a39c4 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c @@ -4,7 +4,7 @@ #include <arm_sve.h> -void test_features(svmfloat8_t zn, fpm_t fpm) { +void test_features(svmfloat8_t zn, svmfloat8_t zm, mfloat8_t x, fpm_t fpm) { 
svcvt1_bf16_mf8_fpm(zn, fpm); // expected-error@-1 {{'svcvt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}} svcvt2_bf16_mf8_fpm(zn, fpm); @@ -30,4 +30,25 @@ void test_features(svmfloat8_t zn, fpm_t fpm) { // expected-error@-1 {{'svcvtnb_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}} svcvtnt_mf8_f32_x2_fpm(zn, svcreate2(svundef_f32(), svundef_f32()), fpm); // expected-error@-1 {{'svcvtnt_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}} + + svdot_f32_mf8_fpm(svundef_f32(), zn, zm, fpm); +// expected-error@-1 {{'svdot_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}} + svdot_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm); +// expected-error@-1 {{'svdot_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}} + svdot_f16_mf8_fpm(svundef_f16(), zn, zm, fpm); +// expected-error@-1 {{'svdot_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}} + svdot_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm); +// expected-error@-1 {{'svdot_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}} + svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 3, fpm); +// expected-error@-1 {{'svdot_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}} + svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm); +// expected-error@-1 {{'svdot_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}} } + + +void test_imm_range(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) { + svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); +// expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm); +// expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} +} \ No newline at end of file diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 2d9f5c3381018a..14e5637f62517e 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -253,7 +253,7 @@ class Intrinsic { /// Return true if the intrinsic takes a splat operand. bool hasSplat() const { // These prototype modifiers are described in arm_sve.td. - return Proto.find_first_of("ajfrKLR@") != std::string::npos; + return Proto.find_first_of("ajfrKLR@!") != std::string::npos; } /// Return the parameter index of the splat operand. @@ -262,7 +262,7 @@ class Intrinsic { for (; I < Proto.size(); ++I, ++Param) { if (Proto[I] == 'a' || Proto[I] == 'j' || Proto[I] == 'f' || Proto[I] == 'r' || Proto[I] == 'K' || Proto[I] == 'L' || - Proto[I] == 'R' || Proto[I] == '@') + Proto[I] == 'R' || Proto[I] == '@' || Proto[I] == '!') break; // Multivector modifier can be skipped @@ -910,6 +910,11 @@ void SVEType::applyModifier(char Mod) { Kind = MFloat8; ElementBitwidth = 8; break; + case '!': + Kind = MFloat8; + Bitwidth = ElementBitwidth = 8; + NumVectors = 0; + break; case '.': llvm_unreachable(". 
is never a type in itself"); break; diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 0a1bd4c923b9b8..14f4b384dfcca6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3886,6 +3886,22 @@ let TargetPrefix = "aarch64" in { [llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>], [IntrReadMem, IntrInaccessibleMemOnly]>; + // Dot product + class SVE2_FP8_FMLA_FDOT + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [IntrReadMem, IntrInaccessibleMemOnly]>; + + class SVE2_FP8_FMLA_FDOT_Lane + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty], + [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>; + + def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane; + class SME2_FP8_CVT_X2_Single_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_nxv16i8_ty], diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1a5be28dce4a0c..6971aae6dbe5b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4423,18 +4423,17 @@ let Predicates = [HasSVE2, HasF8F16MM] in { let Predicates = [HasSSVE_FP8DOT2] in { // FP8 Widening Dot-Product - Indexed Group -defm FDOT_ZZZI_BtoH : sve2_fp8_dot_indexed_h<"fdot">; +defm FDOT_ZZZI_BtoH : sve2_fp8_dot_indexed_h<"fdot", int_aarch64_sve_fp8_fdot_lane>; // FP8 Widening Dot-Product - Group -// TODO: Replace nxv16i8 by nxv16f8 -defm FDOT_ZZZ_BtoH : sve_fp8_dot<0b0, ZPR16, "fdot">; +defm FDOT_ZZZ_BtoH : sve_fp8_dot<0b0, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fp8_fdot>; } // TODO: Replace nxv16i8 by nxv16f8 let Predicates = [HasSSVE_FP8DOT4] in { // FP8 Widening Dot-Product - Indexed Group -defm FDOT_ZZZI_BtoS : sve2_fp8_dot_indexed_s<"fdot">; +defm FDOT_ZZZI_BtoS : sve2_fp8_dot_indexed_s<"fdot", int_aarch64_sve_fp8_fdot_lane>; // FP8 Widening Dot-Product - Group -defm FDOT_ZZZ_BtoS : sve_fp8_dot<0b1, ZPR32, "fdot">; +defm FDOT_ZZZ_BtoS : sve_fp8_dot<0b1, ZPR32, "fdot", nxv4f32, int_aarch64_sve_fp8_fdot>; } let Predicates = [HasSVE2orSME2, HasLUT] in { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 92aad3d2aec48b..cc52306f78a37d 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9267,10 +9267,16 @@ multiclass sve_float_dot<bit bf, bit o2, ZPRRegOp dst_ty, ZPRRegOp src_ty, def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, InVT, InVT, !cast<Instruction>(NAME)>; } -multiclass sve_fp8_dot<bit bf, ZPRRegOp dst_ty, string asm> { - def NAME : sve_float_dot<bf, 0b1, dst_ty, ZPR8, asm>{ +multiclass sve_fp8_dot<bit bf, ZPRRegOp dstrc, string asm, ValueType vt, + SDPatternOperator op> { + def NAME : sve_float_dot<bf, 0b1, dstrc, ZPR8, asm> { let Uses = [FPMR, FPCR]; + + let mayLoad = 1; + let mayStore = 0; } + + def : SVE_3_Op_Pat<vt, op, vt, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; } class sve_float_dot_indexed<bit bf, ZPRRegOp dst_ty, ZPRRegOp src1_ty, @@ -10953,24 +10959,37 @@ class sve_fp8_dot_indexed<bits<4> opc, ZPRRegOp dst_ty, Operand iop_ty, string m let DestructiveInstType = DestructiveOther; let hasSideEffects = 0; let mayRaiseFPException = 1; + + let mayLoad = 1; + let mayStore = 0; } // FP8 Widening Dot-Product - Indexed Group 
-multiclass sve2_fp8_dot_indexed_h<string asm>{ - def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH, asm> { +multiclass sve2_fp8_dot_indexed_h<string asm, SDPatternOperator op> { + def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH32b, asm> { bits<3> iop; let Inst{20-19} = iop{2-1}; let Inst{11} = iop{0}; + + let mayLoad = 1; + let mayStore = 0; } + + def : SVE_4_Op_Pat<nxv8f16, op, nxv8f16, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; } -multiclass sve2_fp8_dot_indexed_s<string asm>{ +multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> { def NAME : sve_fp8_dot_indexed<{1, ?, ?, 0}, ZPR32, VectorIndexS32b, asm> { bits<2> iop; let Inst{20-19} = iop{1-0}; + + let mayLoad = 1; + let mayStore = 0; } + + def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; } // FP8 Look up table diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll new file mode 100644 index 00000000000000..0cead19a74bfd5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8dot2,+fp8dot4 < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define <vscale x 4 x float> @fdot_4way(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_4way: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 4 x float> %r +} + +define <vscale x 8 x half> @fdot_2way(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_2way: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot z0.h, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 8 x half> %r +} + +define <vscale x 4 x float> @fdot_4way_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_4way_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot z0.s, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 4 x float> %r +} + +define <vscale x 8 x half> @fdot_2way_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_2way_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot z0.h, z1.b, z2.b[5] +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 5) + ret <vscale x 8 x half> %r +} >From cde9701ce5af93e66a87fbb736621d3d07cdf924 Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.veli...@arm.com> Date: Mon, 9 Dec 2024 15:35:33 +0000 Subject: [PATCH 2/4] [fixup] Misc alterations (NFC) --- clang/include/clang/Basic/arm_sve_sme_incl.td | 2 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 44201b15505599..b3db6f5ecff503 100644 --- 
a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -88,6 +88,7 @@ include "arm_immcheck_incl.td" // j: element type promoted to 64bits (splat to vector type) // K: element type bitcast to a signed integer (splat to vector type) // L: element type bitcast to an unsigned integer (splat to vector type) +// !: mfloat8_t (splat to svmfloat8_t) // // i: constant uint64_t // k: int32_t @@ -105,7 +106,6 @@ include "arm_immcheck_incl.td" // N: svfloat64_t // $: svbfloat16_t // ~: svmfloat8_t -// !: mfloat8_t (splat to svmfloat8_t) // J: Prefetch type (sv_prfop) diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index cc52306f78a37d..ded7bd2bf3d0b0 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10971,9 +10971,6 @@ multiclass sve2_fp8_dot_indexed_h<string asm, SDPatternOperator op> { let Inst{20-19} = iop{2-1}; let Inst{11} = iop{0}; - - let mayLoad = 1; - let mayStore = 0; } def : SVE_4_Op_Pat<nxv8f16, op, nxv8f16, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; @@ -10984,9 +10981,6 @@ multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> { bits<2> iop; let Inst{20-19} = iop{1-0}; - - let mayLoad = 1; - let mayStore = 0; } def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; >From f22dfd5aa1b0d8300340c690364629a71e21027c Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.veli...@arm.com> Date: Tue, 26 Nov 2024 18:01:03 +0000 Subject: [PATCH 3/4] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add This patch adds the following intrinsics: * 8-bit floating-point multiply-add long to half-precision (bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalb[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svmlalb[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (bottom, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalb_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalt[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat16_t svmlalt[_n_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long to half-precision (top, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat16_t svmlalt_lane[_f16_mf8]_fpm(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlallbb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom bottom, indexed). 
// Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlallbt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (bottom top, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlallbt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top bottom). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltb[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlalltb[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top bottom, indexed). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltb_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top top). // Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltt[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm); svfloat32_t svmlalltt[_n_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm); * 8-bit floating-point multiply-add long long to single-precision (top top, indexed). 
// Only if (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA svfloat32_t svmlalltt_lane[_f32_mf8]_fpm(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, uint64_t imm0_15, fpm_t fpm); --- clang/include/clang/Basic/arm_sve.td | 31 ++ .../fp8-intrinsics/acle_sve2_fp8_fmla.c | 389 ++++++++++++++++++ .../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 53 ++- llvm/include/llvm/IR/IntrinsicsAArch64.td | 19 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 24 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 22 +- llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll | 114 +++++ 7 files changed, 636 insertions(+), 16 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 2c8ca8014387d3..e9396e34adad8f 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2495,3 +2495,34 @@ let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in { def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>; } +let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in { + // 8-bit floating-point multiply-add long to half-precision (bottom) + def SVFMLALB : SInst<"svmlalb[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point multiply-add long to half-precision (bottom, indexed) + def SVFMLALB_LANE : SInst<"svmlalb_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>; + + // 8-bit floating-point multiply-add long to half-precision (top) + def SVFMLALT : SInst<"svmlalt[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALT_N : SInst<"svmlalt[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point multiply-add long to half-precision (top, indexed) + def SVFMLALT_LANE : SInst<"svmlalt_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>; + + // 8-bit floating-point multiply-add long long to single-precision (all top/bottom variants) + def SVFMLALLBB : SInst<"svmlallbb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLBB_N : SInst<"svmlallbb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLBT : SInst<"svmlallbt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLBT_N : SInst<"svmlallbt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLTB : SInst<"svmlalltb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLTB_N : SInst<"svmlalltb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLTT : SInst<"svmlalltt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, 
"aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>; + def SVFMLALLTT_N : SInst<"svmlalltt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>; + + // 8-bit floating-point multiply-add long long to single-precision (indexed, all top/bottom variants) + def SVFMLALLBB_LANE : SInst<"svmlallbb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLBT_LANE : SInst<"svmlallbt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLTB_LANE : SInst<"svmlalltb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; + def SVFMLALLTT_LANE : SInst<"svmlalltt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>; +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c new file mode 100644 index 00000000000000..425e6a57ffe3ca --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c @@ -0,0 +1,389 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8fma -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#ifdef __ARM_FEATURE_SME +#include <arm_sme.h> +#else +#include <arm_sve.h> +#endif + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +#ifdef __ARM_FEATURE_SME +#define STREAMING __arm_streaming +#else +#define STREAMING +#endif + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svmlalb_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalb,_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_n_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z22test_svmlalb_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +svfloat16_t test_svmlalb_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalb,_n_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], 
<vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svmlalt_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalt,_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_n_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z22test_svmlalt_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]] +// +svfloat16_t test_svmlalt_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalt,_n_f16_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_lane_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> 
@_Z25test_svmlalb_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svmlalb_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalb_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_lane_f16_mf8( +// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z25test_svmlalt_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svmlalt_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalt_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlallbb_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbb,_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// 
CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_n_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlallbb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +svfloat32_t test_svmlallbb_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbb,_n_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlallbt_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbt,_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_n_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x 
i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlallbt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +svfloat32_t test_svmlallbt_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbt,_n_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlalltb_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltb,_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_n_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void 
@llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlalltb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +svfloat32_t test_svmlalltb_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltb,_n_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlalltt_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltt,_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_n_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = 
insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlalltt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]]) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]] +// +svfloat32_t test_svmlalltt_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltt,_n_f32_mf8,_fpm)(zda, zn, zm, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_lane_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlallbb_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlallbb_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbb_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_lane_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 
7) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlallbt_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlallbt_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlallbt_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_lane_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlalltb_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +svfloat32_t test_svmlalltb_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltb_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm); +} + +// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_lane_f32_mf8( +// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlalltt_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]] +// 
+svfloat32_t test_svmlalltt_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING { + return SVE_ACLE_FUNC(svmlalltt_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm); +} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c index d76e729b6a39c4..192d200eb4910b 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c @@ -43,12 +43,61 @@ void test_features(svmfloat8_t zn, svmfloat8_t zm, mfloat8_t x, fpm_t fpm) { // expected-error@-1 {{'svdot_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}} svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm); // expected-error@-1 {{'svdot_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}} -} + svmlalb_f16_mf8_fpm(svundef_f16(), zn, zm, fpm); + // expected-error@-1 {{'svmlalb_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalb_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm); + // expected-error@-1 {{'svmlalb_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalt_f16_mf8_fpm(svundef_f16(), zn, zm, fpm); + // expected-error@-1 {{'svmlalt_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalt_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm); + // expected-error@-1 {{'svmlalt_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalb_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm); + // expected-error@-1 {{'svmlalb_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalt_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm); + // expected-error@-1 {{'svmlalt_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbb_f32_mf8_fpm(svundef_f32(), zn, zm, fpm); + // expected-error@-1 {{'svmlallbb_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbb_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm); + // expected-error@-1 {{'svmlallbb_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbt_f32_mf8_fpm(svundef_f32(), zn, zm, fpm); + // expected-error@-1 {{'svmlallbt_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbt_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm); + // expected-error@-1 {{'svmlallbt_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltb_f32_mf8_fpm(svundef_f32(), zn, zm, fpm); + // expected-error@-1 {{'svmlalltb_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltb_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm); + // expected-error@-1 {{'svmlalltb_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltt_f32_mf8_fpm(svundef_f32(), zn, zm, fpm); + // expected-error@-1 {{'svmlalltt_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltt_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm); + // expected-error@-1 {{'svmlalltt_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm); + // expected-error@-1 {{'svmlallbb_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlallbt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm); + // expected-error@-1 {{'svmlallbt_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm); + // 
expected-error@-1 {{'svmlalltb_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} + svmlalltt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm); + // expected-error@-1 {{'svmlalltt_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}} +} void test_imm_range(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) { svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm); // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} -} \ No newline at end of file + + svmlalb_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svmlalt_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} + svmlallbb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmlallbt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmlalltb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmlalltt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm); + // expected-error@-1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 14f4b384dfcca6..1e5fc4b4cdc10a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3902,6 +3902,25 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT; def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane; + // Fused multiply-add + def int_aarch64_sve_fp8_fmlalb : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlalb_lane : SVE2_FP8_FMLA_FDOT_Lane; + + def int_aarch64_sve_fp8_fmlalt : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlalt_lane : SVE2_FP8_FMLA_FDOT_Lane; + + def int_aarch64_sve_fp8_fmlallbb : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlallbb_lane : SVE2_FP8_FMLA_FDOT_Lane; + + def int_aarch64_sve_fp8_fmlallbt : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlallbt_lane : SVE2_FP8_FMLA_FDOT_Lane; + + def int_aarch64_sve_fp8_fmlalltb : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlalltb_lane : SVE2_FP8_FMLA_FDOT_Lane; + + def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT; + def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane; + class SME2_FP8_CVT_X2_Single_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_nxv16i8_ty], diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 6971aae6dbe5b7..4a4412f9df6a1a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4396,21 +4396,21 @@ defm FAMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64famin_p>; let Predicates = [HasSSVE_FP8FMA] in { // FP8 Widening Multiply-Add Long - Indexed Group -def FMLALB_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b0, "fmlalb">; -def FMLALT_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b1, "fmlalt">; +defm FMLALB_ZZZI : 
sve2_fp8_mla_long_by_indexed_elem<0b0, "fmlalb", int_aarch64_sve_fp8_fmlalb_lane>; +defm FMLALT_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b1, "fmlalt", int_aarch64_sve_fp8_fmlalt_lane>; // FP8 Widening Multiply-Add Long Group -def FMLALB_ZZZ : sve2_fp8_mla<0b100, ZPR16, "fmlalb">; -def FMLALT_ZZZ : sve2_fp8_mla<0b101, ZPR16, "fmlalt">; +defm FMLALB_ZZZ : sve2_fp8_mla<0b100, ZPR16, "fmlalb", nxv8f16, int_aarch64_sve_fp8_fmlalb>; +defm FMLALT_ZZZ : sve2_fp8_mla<0b101, ZPR16, "fmlalt", nxv8f16, int_aarch64_sve_fp8_fmlalt>; // FP8 Widening Multiply-Add Long Long - Indexed Group -def FMLALLBB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b00, "fmlallbb">; -def FMLALLBT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b01, "fmlallbt">; -def FMLALLTB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b10, "fmlalltb">; -def FMLALLTT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b11, "fmlalltt">; +defm FMLALLBB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b00, "fmlallbb", int_aarch64_sve_fp8_fmlallbb_lane>; +defm FMLALLBT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b01, "fmlallbt", int_aarch64_sve_fp8_fmlallbt_lane>; +defm FMLALLTB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b10, "fmlalltb", int_aarch64_sve_fp8_fmlalltb_lane>; +defm FMLALLTT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b11, "fmlalltt", int_aarch64_sve_fp8_fmlalltt_lane>; // FP8 Widening Multiply-Add Long Long Group -def FMLALLBB_ZZZ : sve2_fp8_mla<0b000, ZPR32, "fmlallbb">; -def FMLALLBT_ZZZ : sve2_fp8_mla<0b001, ZPR32, "fmlallbt">; -def FMLALLTB_ZZZ : sve2_fp8_mla<0b010, ZPR32, "fmlalltb">; -def FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt">; +defm FMLALLBB_ZZZ : sve2_fp8_mla<0b000, ZPR32, "fmlallbb", nxv4f32, int_aarch64_sve_fp8_fmlallbb>; +defm FMLALLBT_ZZZ : sve2_fp8_mla<0b001, ZPR32, "fmlallbt", nxv4f32, int_aarch64_sve_fp8_fmlallbt>; +defm FMLALLTB_ZZZ : sve2_fp8_mla<0b010, ZPR32, "fmlalltb", nxv4f32, int_aarch64_sve_fp8_fmlalltb>; +defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_sve_fp8_fmlalltt>; } // End HasSSVE_FP8FMA let Predicates = [HasSVE2, HasF8F32MM] in { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index ded7bd2bf3d0b0..b317c7cbc85488 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10844,7 +10844,7 @@ multiclass sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOp // FP8 Widening Multiply-Add Long - Indexed Group class sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic> : I<(outs ZPR16:$Zda), - (ins ZPR16:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB:$imm4), + (ins ZPR16:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB32b:$imm4), mnemonic, "\t$Zda, $Zn, $Zm$imm4", "", []>, Sched<[]>{ bits<5> Zda; @@ -10866,6 +10866,12 @@ class sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic> let Uses = [FPMR, FPCR]; } +multiclass sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic, SDPatternOperator op> { + def NAME : sve2_fp8_mla_long_by_indexed_elem<T, mnemonic>; + + def : SVE_4_Op_Pat<nxv8f16, op, nxv8f16, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; +} + // FP8 Widening Multiply-Add (Long)/(Long Long) Group class sve2_fp8_mla<bits<3>opc, ZPRRegOp dst_ty, string mnemonic> : I<(outs dst_ty:$Zda), @@ -10890,10 +10896,16 @@ class sve2_fp8_mla<bits<3>opc, ZPRRegOp dst_ty, string mnemonic> let Uses = [FPMR, FPCR]; } +multiclass sve2_fp8_mla<bits<3> opc, ZPRRegOp dst_ty, string mnemonic, ValueType vta, SDPatternOperator op> { + def NAME : 
sve2_fp8_mla<opc, dst_ty, mnemonic>; + + def : SVE_3_Op_Pat<vta, op, vta, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>; +} + // FP8 Widening Multiply-Add Long Long - Indexed Group class sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic> : I<(outs ZPR32:$Zda), - (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB:$imm4), + (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB32b:$imm4), mnemonic, "\t$Zda, $Zn, $Zm$imm4", "", []>, Sched<[]>{ bits<5> Zda; @@ -10915,6 +10927,12 @@ class sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic> let Uses = [FPMR, FPCR]; } +multiclass sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic, SDPatternOperator op> { + def NAME : sve2_fp8_mla_long_long_by_indexed_elem<TT, mnemonic>; + + def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>; +} + // FP8 Matrix Multiply-accumulate Group class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic> : I<(outs dst_ty:$Zda), diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll new file mode 100644 index 00000000000000..b9ec7086d7f08d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s +; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define <vscale x 8 x half> @fmla_2way_bot(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_2way_bot: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalb z0.h, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 8 x half> %r +} + +define <vscale x 8 x half> @fmla_2way_top(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_2way_top: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalt z0.h, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 8 x half> %r +} + +define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_2way_bot_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalb z0.h, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 8 x half> %r +} + +define <vscale x 8 x half> @fdot_2way_top_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fdot_2way_top_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalt z0.h, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 8 x half> %r +} + +define <vscale x 4 x float> @fmla_4way_bb(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_bb: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlallbb z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 4 
x float> %r +} + +define <vscale x 4 x float> @fmla_4way_bt(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_bt: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlallbt z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_tb(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_tb: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalltb z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_tt(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_tt: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalltt z0.s, z1.b, z2.b +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_bb_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_bb_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlallbb z0.s, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_bt_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_bt_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlallbt z0.s, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_tb_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_tb_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalltb z0.s, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 4 x float> %r +} + +define <vscale x 4 x float> @fmla_4way_tt_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +; CHECK-LABEL: fmla_4way_tt_lane: +; CHECK: // %bb.0: +; CHECK-NEXT: fmlalltt z0.s, z1.b, z2.b[3] +; CHECK-NEXT: ret + %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3) + ret <vscale x 4 x float> %r +} + >From a5a79186985603bc2f58c7b98bf2b510f2b0f944 Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.veli...@arm.com> Date: Fri, 13 Dec 2024 09:16:34 +0000 Subject: [PATCH 4/4] [fixup] Fix test function names --- llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll index b9ec7086d7f08d..bbcf671fad7644 100644 --- a/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll +++ b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll @@ -22,7 +22,7 @@ define 
<vscale x 8 x half> @fmla_2way_top(<vscale x 8 x half> %a, <vscale x 16 x ret <vscale x 8 x half> %r } -define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +define <vscale x 8 x half> @fmla_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { -; CHECK-LABEL: fdot_2way_bot_lane: +; CHECK-LABEL: fmla_2way_bot_lane: ; CHECK: // %bb.0: ; CHECK-NEXT: fmlalb z0.h, z1.b, z2.b[3] @@ -31,7 +31,7 @@ define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x ret <vscale x 8 x half> %r } -define <vscale x 8 x half> @fdot_2way_top_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { +define <vscale x 8 x half> @fmla_2way_top_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) { -; CHECK-LABEL: fdot_2way_top_lane: +; CHECK-LABEL: fmla_2way_top_lane: ; CHECK: // %bb.0: ; CHECK-NEXT: fmlalt z0.h, z1.b, z2.b[3] _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
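
For reference, a user-side sketch of how the new 4-way multiply-add intrinsics compose (illustrative only, not part of the patch). It assumes the analogous ACLE feature guards, (__ARM_FEATURE_SVE2 && __ARM_FEATURE_FP8FMA) || __ARM_FEATURE_SSVE_FP8FMA, and that the caller has already composed an fpm_t value describing the FP8 operand formats:

#include <arm_sve.h>

#if (defined(__ARM_FEATURE_SVE2) && defined(__ARM_FEATURE_FP8FMA)) || \
    defined(__ARM_FEATURE_SSVE_FP8FMA)
// The BB/BT/TB/TT variants each multiply-accumulate one of the four FP8
// sub-elements of every 32-bit container; issuing all four accumulates
// the products of every element pair, giving a dot-product-style kernel.
svfloat32_t fp8_fma_accumulate(svfloat32_t acc, svmfloat8_t a,
                               svmfloat8_t b, fpm_t fpm) {
  acc = svmlallbb_f32_mf8_fpm(acc, a, b, fpm);
  acc = svmlallbt_f32_mf8_fpm(acc, a, b, fpm);
  acc = svmlalltb_f32_mf8_fpm(acc, a, b, fpm);
  acc = svmlalltt_f32_mf8_fpm(acc, a, b, fpm);
  return acc;
}
#endif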