https://github.com/jthackray created https://github.com/llvm/llvm-project/pull/119845
Add support for the following SME 8 bit floating-point dot-product intrinsics: * svdot_single_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); >From 6dcf15164f66467ad6e4cd6075d03e9f75331e5d Mon Sep 17 00:00:00 2001 From: Jonathan Thackray <jonathan.thack...@arm.com> Date: Thu, 5 Dec 2024 21:13:46 +0000 Subject: [PATCH] [AArch64] Add intrinsics for SME FP8 FDOT single and multi instructions Add support for the following SME 8 bit floating-point dot-product intrinsics: * svdot_single_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_single_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); * svdot_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm); Co-authored-by: Momchil Velikov <momchil.veli...@arm.com> Co-authored-by: Marian Lukac <marian.lu...@arm.com> --- clang/include/clang/Basic/arm_sme.td | 12 ++ .../sme2-intrinsics/acle_sme2_fp8_fdot.c | 174 +++++++++++++++++- .../acle_sme2_fp8_fdot.c | 16 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 36 ++++ .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 17 +- llvm/lib/Target/AArch64/SMEInstrFormats.td | 64 +++++++ .../AArch64/sme2-intrinsics-fp8-fdot.ll | 112 +++++++++++ 7 files changed, 415 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 0fae70866cd55e..e7625d8d3e0b8a 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -748,11 +748,23 @@ let SMETargetGuard = "sme2" in { let SMETargetGuard = "sme-f8f32" in { def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; + + def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + + def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; } let SMETargetGuard = "sme-f8f16" in { def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + + def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + + def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; + def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>; } //////////////////////////////////////////////////////////////////////////////// diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c index 74d18c32d5b3ab..a151d162e01085 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c @@ -1,18 +1,18 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: aarch64-registered-target -#include <arm_sme.h> // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include <arm_sme.h> #ifdef SVE_OVERLOADED_FORMS -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 #else -#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 #endif // CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x2( @@ -32,7 +32,7 @@ void test_svdot_lane_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr); + SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x2_fpm,,)(slice, zn, zm, 3, fpmr); } // CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x4( @@ -52,7 +52,7 @@ void test_svdot_lane_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, void test_svdot_lane_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr); + SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x4_fpm,,)(slice, zn, zm, 3, fpmr); } // CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2( @@ -72,7 +72,7 @@ void test_svdot_lane_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr); + SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm,,)(slice, zn, zm, 3, fpmr); } // CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4( @@ -92,5 +92,165 @@ void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") { - SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr); + SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm,,)(slice, zn, zm, 3, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_single_za32_f8_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za32_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, + svmfloat8_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,_single,_za32,_mf8,_vg1x2_fpm)(slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_single_za32_f8_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za32_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, + svmfloat8_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,_single,_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_multi_za32_f8_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za32_f8_vg1x2j13svmfloat8x2_tS_m( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, + svmfloat8x2_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,,_za32,_mf8,_vg1x2_fpm) (slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_multi_za32_f8_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za32_f8_vg1x4j13svmfloat8x4_tS_m( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, + svmfloat8x4_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,,_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_single_za16_f8_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, + svmfloat8_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,_single,_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_single_za16_f8_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, + svmfloat8_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,_single,_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_multi_za16_f8_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za16_f8_vg1x2j13svmfloat8x2_tS_m( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn, + svmfloat8x2_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,,_za16,_mf8,_vg1x2_fpm) (slice, zn, zm, fpmr); +} + +// CHECK-LABEL: define dso_local void @test_svdot_multi_za16_f8_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za16_f8_vg1x4j13svmfloat8x4_tS_m( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn, + svmfloat8x4_t zm, fpm_t fpmr) + __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svdot,,_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr); } diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c index 975f0b2e3dd853..bbcd0335ff555b 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c @@ -14,6 +14,22 @@ void test_features(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2, svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, 3, fpmr); // expected-error@+1 {{'svdot_lane_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}} svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, 3, fpmr); + // expected-error@+1 {{'svdot_single_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}} + svdot_single_za32_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr); + // expected-error@+1 {{'svdot_single_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}} + svdot_single_za32_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr); + // expected-error@+1 {{'svdot_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}} + svdot_za32_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr); + // expected-error@+1 {{'svdot_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}} + svdot_za32_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr); + // expected-error@+1 {{'svdot_single_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}} + svdot_single_za16_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr); + // expected-error@+1 {{'svdot_single_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}} + svdot_single_za16_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr); + // expected-error@+1 {{'svdot_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}} + svdot_za16_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr); + // expected-error@+1 {{'svdot_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}} + svdot_za16_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr); } void test_imm(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2, diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 654bc64a30bd89..5a7cce3bd08fa4 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3874,11 +3874,47 @@ class SME2_FP8_FDOT_LANE_VG1x4 : llvm_i32_ty], [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>; +class SME2_FP8_FDOT_SINGLE_VG1x2 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + +class SME2_FP8_FDOT_SINGLE_VG1x4 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + +class SME2_FP8_FDOT_MULTI_VG1x2 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + +class SME2_FP8_FDOT_MULTI_VG1x4 : + DefaultAttrsIntrinsic<[], [llvm_i32_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [IntrInaccessibleMemOnly, IntrHasSideEffects]>; + def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2; def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4; def int_aarch64_sme_fp8_fdot_lane_za32_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2; def int_aarch64_sme_fp8_fdot_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4; + + def int_aarch64_sme_fp8_fdot_single_za16_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2; + def int_aarch64_sme_fp8_fdot_single_za16_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4; + + def int_aarch64_sme_fp8_fdot_single_za32_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2; + def int_aarch64_sme_fp8_fdot_single_za32_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4; + + def int_aarch64_sme_fp8_fdot_multi_za16_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2; + def int_aarch64_sme_fp8_fdot_multi_za16_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4; + + def int_aarch64_sme_fp8_fdot_multi_za32_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2; + def int_aarch64_sme_fp8_fdot_multi_za32_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4; } // diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index fa577cf92e99d1..a9e083c7ec53c0 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -988,11 +988,11 @@ let Predicates = [HasSMEF8F16] in { defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>; defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>; defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>; -defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>; -defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>; -defm FDOT_VG2_M2Z2Z_BtoH : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>; -defm FDOT_VG4_M4Z4Z_BtoH : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>; +defm FDOT_VG2_M2ZZ_BtoH : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x2>; +defm FDOT_VG4_M4ZZ_BtoH : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x4>; +defm FDOT_VG2_M2Z2Z_BtoH : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x2>; +defm FDOT_VG4_M4Z4Z_BtoH : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x4>; def FMLAL_MZZI_BtoH : sme2_mla_ll_array_index_16b<"fmlal", 0b11, 0b00>; defm FMLAL_VG2_M2ZZI_BtoH : sme2_multi_vec_array_vg2_index_16b<"fmlal", 0b10, 0b111>; @@ -1011,11 +1011,10 @@ let Predicates = [HasSMEF8F32] in { defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>; defm FDOT_VG4_M4ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x4>; -defm FDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>; -defm FDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>; - -defm FDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>; -defm FDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>; +defm FDOT_VG2_M2ZZ_BtoS : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x2>; +defm FDOT_VG4_M4ZZ_BtoS : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x4>; +defm FDOT_VG2_M2Z2Z_BtoS : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x2>; +defm FDOT_VG4_M4Z4Z_BtoS : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x4>; def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>; def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 9f25749c83db83..000736b30b058b 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -5882,3 +5882,67 @@ multiclass sme2_fp8_fdot_index_za32_vg1x4<string mnemonic, def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>; } + +multiclass sme2_fp8_fdot_single_vg1x2<string mnemonic, bits<7> op, + MatrixOperand matrix_op, + SDPatternOperator intrinsic> { + def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZ_b, ZPR4b8, mnemonic>, + SMEPseudo2Instr<NAME, 1> { + let Uses=[FPMR, FPCR]; + } + + def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm", + (!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b:$Zn, ZPR4b8:$Zm), 0>; + + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZ_b, ZPR4b8, SMEMatrixArray>; + + def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>; +} + +multiclass sme2_fp8_fdot_single_vg1x4<string mnemonic, bits<7> op, + MatrixOperand matrix_op, + SDPatternOperator intrinsic> { + def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZZZ_b, ZPR4b8, mnemonic>, + SMEPseudo2Instr<NAME, 1> { + let Uses=[FPMR, FPCR]; + } + + def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm", + (!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b:$Zn, ZPR4b8:$Zm), 0>; + + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b, ZPR4b8, SMEMatrixArray>; + + def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>; +} + +multiclass sme2_fp8_fdot_multi_vg1x2<string mnemonic, bits<7> op, + MatrixOperand matrix_op, + SDPatternOperator intrinsic> { + def NAME : sme2_dot_mla_add_sub_array_vg2_multi<op, matrix_op, ZZ_b_mul_r, mnemonic>, + SMEPseudo2Instr<NAME, 1> { + let Uses=[FPMR, FPCR]; + } + + def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm", + (!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b_mul_r:$Zn, ZZ_b_mul_r:$Zm), 0>; + + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, SMEMatrixArray>; + + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>; +} + +multiclass sme2_fp8_fdot_multi_vg1x4<string mnemonic, bits<7> op, + MatrixOperand matrix_op, + SDPatternOperator intrinsic> { + def NAME : sme2_dot_mla_add_sub_array_vg4_multi<op, matrix_op, ZZZZ_b_mul_r, mnemonic>, + SMEPseudo2Instr<NAME, 1> { + let Uses=[FPMR, FPCR]; + } + + def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm", + (!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b_mul_r:$Zn, ZZZZ_b_mul_r:$Zm), 0>; + + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, SMEMatrixArray>; + + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>; +} diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll index 7fcbc328aa085e..d5783f09845869 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll @@ -56,4 +56,116 @@ define void @test_fdot32_1x4_indexed(i32 %slice.0, ret void } +define void @test_fdot32_1x2_single(i32 %slice.0, +; CHECK-LABEL: test_fdot32_1x2_single: +; CHECK: mov w8, w0 +; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, z2.b +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm) + ret void +} + +define void @test_fdot32_1x4_single(i32 %slice.0, +; CHECK-LABEL: test_fdot32_1x4_single: +; CHECK: mov w8, w0 +; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm) + ret void +} + +define void @test_fdot32_1x2_multi(i32 %slice.0, +; CHECK-LABEL: test_fdot32_1x2_multi: +; CHECK: mov w8, w0 +; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b } +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) + ret void +} + +define void @test_fdot32_1x4_multi(i32 %slice.0, +; CHECK-LABEL: test_fdot32_1x4_multi: +; CHECK: mov w8, w0 +; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b } +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) + ret void +} + +define void @test_fdot16_1x2_single(i32 %slice.0, +; CHECK-LABEL: test_fdot16_1x2_single: +; CHECK: mov w8, w0 +; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm) + ret void +} + +define void @test_fdot16_1x4_single(i32 %slice.0, +; CHECK-LABEL: test_fdot16_1x4_single: +; CHECK: mov w8, w0 +; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, z4.b +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm) + ret void +} + +define void @test_fdot16_1x2_multi(i32 %slice.0, +; CHECK-LABEL: test_fdot16_1x2_multi: +; CHECK: mov w8, w0 +; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b } +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) + ret void +} + +define void @test_fdot16_1x4_multi(i32 %slice.0, +; CHECK-LABEL: test_fdot16_1x4_multi: +; CHECK: mov w8, w0 +; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b } +; CHECK: ret + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 { + %slice = add i32 %slice.0, 7 + call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 %slice, + <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, + <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) + ret void +} + attributes #0 = { "target-features" = "+sme,+sme-f8f32,+sme-f8f16" } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits