https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/91606
>From d3e381ac645d08b6f3b01283d47344556a163605 Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.veli...@arm.com> Date: Thu, 9 May 2024 15:56:31 +0100 Subject: [PATCH] [AArch64] Add intrinsics for multi-vector to ZA array vector accumulators (#88266) According to the specification in https://github.com/ARM-software/acle/pull/309 this adds the intrinsics void_svadd_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za"); void_svadd_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za"); void_svsub_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za"); void_svsub_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za"); as well as the corresponding `bf16` variants. --- clang/include/clang/Basic/arm_sme.td | 10 + .../acle_sme2_add_sub_za16.c | 193 ++++++++++++++++++ .../acle_sme2_add_sub_za16.c | 29 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 16 +- .../AArch64/sme2-intrinsics-add-sub-za16.ll | 148 ++++++++++++++ 6 files changed, 389 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 7808ee559932e..80e635e4a57ec 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -298,6 +298,16 @@ multiclass ZAAddSub<string n_suffix> { def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } + + let TargetGuard = "sme-f16f16|sme-f8f16" in { + def NAME # _ZA16_VG1X2_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA16_VG1X4_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; + } + + let TargetGuard = "sme2,b16b16" in { + def NAME # _ZA16_VG1X2_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA16_VG1X4_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; + } } defm SVADD : ZAAddSub<"add">; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c new file mode 100644 index 0000000000000..9a8aa448d3780 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c @@ -0,0 +1,193 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature 
+sme2 -target-feature +sme-f8f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -o /dev/null + +// REQUIRES: aarch64-registered-target + +#include <arm_sme.h> + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x2_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svadd_za16_vg1x2_f16j13svfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_f16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x4_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void 
@_Z25test_svadd_za16_vg1x4_f16j13svfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_f16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x2_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svsub_za16_vg1x2_f16j13svfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_f16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x4_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> 
[[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svsub_za16_vg1x4_f16j13svfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_f16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x2_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svadd_za16_vg1x2_bf16j14svbfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x2_bf16(uint32_t slice, svbfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_bf16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x4_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> 
[[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svadd_za16_vg1x4_bf16j14svbfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x4_bf16(uint32_t slice, svbfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_bf16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x2_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svsub_za16_vg1x2_bf16j14svbfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x2_bf16(uint32_t slice, svbfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_bf16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x4_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call 
<vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svsub_za16_vg1x4_bf16j14svbfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x4_bf16(uint32_t slice, svbfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_bf16,_vg1x4)(slice, zn); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c new file mode 100644 index 0000000000000..fb43bc6337086 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s + +// REQUIRES: aarch64-registered-target + +#include <arm_sme.h> + +void test_features(uint32_t slice, svfloat16x2_t zn2, svfloat16x4_t zn4, + svbfloat16x2_t bzn2, svbfloat16x4_t bzn4) __arm_streaming __arm_inout("za") { + // expected-error@+1 {{'svadd_za16_f16_vg1x2' needs target feature sme-f16f16|sme-f8f16}} + svadd_za16_f16_vg1x2(slice, zn2); + // expected-error@+1 {{'svadd_za16_f16_vg1x4' needs target feature sme-f16f16|sme-f8f16}} + svadd_za16_f16_vg1x4(slice, zn4); + // expected-error@+1 {{'svsub_za16_f16_vg1x2' needs target feature sme-f16f16|sme-f8f16}} + svsub_za16_f16_vg1x2(slice, zn2); + // expected-error@+1 {{'svsub_za16_f16_vg1x4' needs target feature sme-f16f16|sme-f8f16}} + svsub_za16_f16_vg1x4(slice, zn4); + + // expected-error@+1 {{'svadd_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svadd_za16_bf16_vg1x2(slice, bzn2); + // expected-error@+1 {{'svadd_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svadd_za16_bf16_vg1x4(slice, bzn4); + // expected-error@+1 {{'svsub_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svsub_za16_bf16_vg1x2(slice, bzn2); + // expected-error@+1 {{'svsub_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svsub_za16_bf16_vg1x4(slice, bzn4); +} + + + diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index e0630a6649dd7..4544cf35fb7b3 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3481,7 +3481,7 @@ let TargetPrefix = "aarch64" in { // Multi-vector add/sub and accumulate into ZA // foreach intr = ["add", "sub"] in { - foreach za = ["za32", "za64"] in { + foreach za = ["za16","za32", "za64"] in { def int_aarch64_sme_ # intr # _ # za # _vg1x2 : SME2_ZA_Write_VG2_Intrinsic; def int_aarch64_sme_ # intr # _ # za # _vg1x4 : SME2_ZA_Write_VG4_Intrinsic; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index c5cbdce476ca1..2b70c4715bf9e 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -793,10 +793,10 @@ defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">; } let Predicates = [HasSMEF16F16orSMEF8F16] in { -defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x2>; +defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x4>; +defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_sub_za16_vg1x2>; +defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_sub_za16_vg1x4>; defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x2>; defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>; @@ -820,10 +820,10 @@ defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch } let Predicates = [HasSME2, HasB16B16] in { -defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x2>; +defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x4>; +defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_sub_za16_vg1x2>; +defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_sub_za16_vg1x4>; defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x2>; defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, 
ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x4>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll new file mode 100644 index 0000000000000..e7a6c0d6c549b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +define void @add_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) #0 { +; CHECK-LABEL: add_f16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fadd za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: fadd za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) + ret void +} + +define void @add_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, +; CHECK-LABEL: add_f16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fadd za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: fadd za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3) #1 { + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3); + ret void +} + +define void @sub_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) #1 { +; CHECK-LABEL: sub_f16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fsub za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: fsub za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) + ret void +} + +define void @sub_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, +; CHECK-LABEL: sub_f16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fsub 
za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: fsub za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3) #0 { + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, + <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3); + ret void +} + +define void @add_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) #2 { +; CHECK-LABEL: add_bf16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfadd za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: bfadd za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) + ret void +} + +define void @add_bf16_vg1x4(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, +; CHECK-LABEL: add_bf16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfadd za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: bfadd za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3) #2 { + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3); + ret void +} + +define void @sub_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) #2 { +; CHECK-LABEL: sub_bf16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfsub za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: bfsub za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) + ret void +} + +define void @sub_bf16_vg1x4(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, +; CHECK-LABEL: sub_bf16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; 
CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfsub za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: bfsub za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3) #2 { + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, + <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3); + ret void +} + +attributes #0 = { nounwind "target-features"="+sme-f16f16" } +attributes #1 = { nounwind "target-features"="+sme-f8f16" } +attributes #2 = { nounwind "target-features"="+sme2,+bf16,+b16b16" }
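
For reference, a minimal usage sketch of the new ACLE intrinsics from streaming C code. This is illustrative only and not part of the patch; the wrapper function names are hypothetical, the intrinsic names, parameter types and attributes follow the CodeGen tests above, and the feature requirements follow the Sema test (sme-f16f16 or sme-f8f16 for the f16 forms, sme2 plus b16b16 for the bf16 forms).

#include <arm_sme.h>

// Accumulate two f16 vectors into consecutive ZA.H array vectors starting
// at 'slice'; lowers to FADD za.h[...], vgx2 per the llc test above.
void acc_rows_f16(uint32_t slice, svfloat16x2_t zn)
    __arm_streaming __arm_inout("za") {
  svadd_za16_f16_vg1x2(slice, zn);
}

// Subtract four bf16 vectors from ZA.H array vectors; lowers to
// BFSUB za.h[...], vgx4.
void sub_rows_bf16(uint32_t slice, svbfloat16x4_t zn)
    __arm_streaming __arm_inout("za") {
  svsub_za16_bf16_vg1x4(slice, zn);
}

The type-suffixed spellings above should also be reachable through the overloaded forms (svadd_za16_vg1x2, svsub_za16_vg1x4) when the argument is an f16 or bf16 tuple, as exercised by the SVE_OVERLOADED_FORMS runs in the CodeGen test.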