Author: Jonathan Thackray Date: 2025-04-01T17:04:59+01:00 New Revision: 558ce50ebc31bbcd5ec5bfad0c0126adfde8bbb0
URL: https://github.com/llvm/llvm-project/commit/558ce50ebc31bbcd5ec5bfad0c0126adfde8bbb0 DIFF: https://github.com/llvm/llvm-project/commit/558ce50ebc31bbcd5ec5bfad0c0126adfde8bbb0.diff LOG: [Clang][LLVM] Implement multi-single vectors MOP4{A/S} (#129226) Implement all multi-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in clang and llvm following the ACLE in https://github.com/ARM-software/acle/pull/381/files Added: clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll Modified: clang/include/clang/Basic/arm_sme.td clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/lib/Target/AArch64/SMEInstrFormats.td Removed: ################################################################################ diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 6312223f5d112..3958ed70f6ad0 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -295,6 +295,7 @@ defm SVMOPS : ZAFPOuterProd<"mops">; multiclass MOP4<string mode, string za, string t, string i, list<ImmCheck> checks> { def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>; def _1x2 : Inst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{d}]", "vid2", t, MergeNone, i # "_1x2", [IsInOutZA, IsStreaming], checks>; + def _2x1 : Inst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{d}]", "vi2d", t, MergeNone, i # "_2x1", [IsInOutZA, IsStreaming], checks>; } let SMETargetGuard = "sme2,sme-mop4" in { @@ -350,6 +351,10 @@ multiclass SUMOP4<string mode, string za, string t, string i, list<ImmCheck> che "vid2.u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x2", [IsStreaming, IsInOutZA], checks>; + def _2x1 : SInst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{3}]", + "vi2u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_2x1", + [IsStreaming, 
IsInOutZA], + checks>; } multiclass USMOP4<string mode, string za, string t, string i, list<ImmCheck> checks> { @@ -361,6 +366,10 @@ multiclass USMOP4<string mode, string za, string t, string i, list<ImmCheck> che "vid2.x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x2", [IsStreaming, IsInOutZA], checks>; + def _2x1 : SInst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{3}]", + "vi2x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_2x1", + [IsStreaming, IsInOutZA], + checks>; } let SMETargetGuard = "sme2,sme-mop4" in { diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c new file mode 100644 index 0000000000000..e42ed95b9b52c --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c @@ -0,0 +1,304 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include 
<arm_sme.h> + +#ifdef SVE_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3, A4_UNUSED) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmop4a_2x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: 
@test_svmop4a_2x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> 
[[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming 
__arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_s16( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 
8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_u16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_u16_s16)(1, zn, zm); +} + + +// CHECK-LABEL: @test_svmop4a_2x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) 
__arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 1, <vscale x 4 x float> [[ZN_COERCE0:%.*]], <vscale x 4 x float> [[ZN_COERCE1:%.*]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 1, <vscale x 2 x double> [[ZN_COERCE0:%.*]], <vscale x 2 x double> [[ZN_COERCE1:%.*]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za16_bf16_bf16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za16,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za16,_bf16_bf16)(1, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp index 47ce2a0f5f80f..f8e57e9b24332 100644 --- a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp @@ -166,3 +166,87 @@ void tests_mop4_imm_f64_f64_1x2(svfloat64_t zn, svfloat64x2_t zm) __arm_streamin svmop4s_1x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} return; } + +void tests_mop4_imm_s8_s8_2x1(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_u8_2x1(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 
18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s8_u8_2x1(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_s8_2x1(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s16_s16_2x1(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_u16_2x1(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za64_u16_u16(-1, zn, zm); // 
expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_s16_u16_2x1(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_s16_2x1(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_f16_f16_2x1(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_bf16_bf16_2x1(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the 
valid range [0, 3]}} + + svmop4a_2x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_f32_f32_2x1(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_f64_f64_2x1(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index f08bdf78b5f96..6c25e6582b836 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3083,6 +3083,7 @@ let TargetPrefix = "aarch64" in { foreach ty = ["s", "u", "su", "us"] in { def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; } } } @@ -3092,9 +3093,10 @@ let TargetPrefix = "aarch64" in { foreach wide = ["", "_wide"] in { def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def int_aarch64_sme_mop4 # mode # wide # "_1x2" : 
SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_mop4 # mode # wide # "_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; } } - + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 87a8f068083d5..ccc061da0be9a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -273,6 +273,11 @@ class SME2_ZA_Tile_Vec_Single_Single_Pat<string name, SDPatternOperator intrinsi class SME2_ZA_Tile_Vec_Multi_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt> : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm1, vt:$Zm2), (!cast<Instruction>(name # _PSEUDO) $tile, $Zn, (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>; + +class SME2_ZA_Tile_Vec_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand imm_ty, ValueType vt> + : Pat<(intrinsic imm_ty:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm), + (!cast<Instruction>(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), $Zm)>; + //===----------------------------------------------------------------------===// // SME pattern match helpers. 
//===----------------------------------------------------------------------===// @@ -616,6 +621,7 @@ class sme_quarter_outer_product_i16_i32<bit u0, bit N, bit M, bit subtr, Registe } multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{ + // Single vectors def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr, ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_BToS, 1>; @@ -623,8 +629,15 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_BToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>; + // Multiple and single vectors def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr, - ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>; + ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_BToS, 1>; + + def NAME # _M2ZZ_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_BToS, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_BToS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv16i8>; + + // Single and multiple vectors def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_BToS, 1>; @@ -632,11 +645,13 @@ multiclass sme_quarter_outer_product_i8_i32<bit zn_u, bit zm_u, bit subtr, strin def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_BToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>; + // Multiple vectors def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>; } multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mnemonic, string op>{ + // Single vectors def _MZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b0, subtr, ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, 
SMEPseudo2Instr<NAME # _MZZ_HToS, 1>; @@ -644,8 +659,15 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HToS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8i16>; + // Multiple and single vectors def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b0, subtr, - ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>; + ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_HToS, 1>; + + def NAME # _M2ZZ_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_HToS, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HToS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8i16>; + + // Single and multiple vectors def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b0, 0b1, subtr, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HToS, 1>; @@ -653,11 +675,13 @@ multiclass sme_quarter_outer_product_i16_i32<bit unsigned, bit subtr, string mne def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HToS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8i16>; + // Multiple vectors def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32<unsigned, 0b1, 0b1, subtr, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>; } multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string mnemonic, string op>{ + // Single vectors def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr, ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZZ_HtoD, 1>; @@ -665,8 +689,15 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HtoD, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv8i16>; + // Multiple and single vectors def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr, - ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>; + ZZ_h_mul_r_Lo, 
ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr<NAME # _M2ZZ_HtoD, 1>; + + def NAME # _M2ZZ_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _M2ZZ_HtoD, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HtoD, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_7, nxv8i16>; + + // Single and multiple vectors def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr<NAME # _MZ2Z_HtoD, 1>; @@ -674,6 +705,7 @@ multiclass sme_quarter_outer_product_i64<bit zn_u, bit zm_u, bit subtr, string m def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_HtoD, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_7, nxv8i16>; + // Multiple vectors def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>; } @@ -5524,7 +5556,11 @@ multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8bf16>; // Multiple and single vectors - def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_S, 1>; + + def NAME # _M2ZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_S, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_S, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8bf16>; // Single and multiple vectors def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>; @@ -5533,7 +5569,6 @@ multiclass sme2_bfmop4as_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_S, 
!cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv8bf16>; - // Multiple vectors def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; } @@ -5680,7 +5715,11 @@ multiclass sme2_fmop4as_fp16_non_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8f16>; // Multiple and single vectors - def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_H, 1>; + + def NAME # _M2ZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2ZZ_H, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_H, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_1, nxv8f16>; // Single and multiple vectors def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>; @@ -5760,7 +5799,11 @@ multiclass sme2_bfmop4as_non_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_H, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv8bf16>; // Multiple and single vectors - def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_H, 1>; + + def NAME # _M2ZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2ZZ_H, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_H, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_1, nxv8bf16>; // Single and multiple vectors def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, 
mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_H, 1>; @@ -5805,7 +5848,11 @@ multiclass sme2_fmop4as_fp32_non_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_S, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv4f32>; // Multiple and single vectors - def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>; + def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_S, 1>; + + def NAME # _M2ZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_s_mul_r_Lo, ZPR32Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_S, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_S, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv4f32>; // Single and multiple vectors def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_S, 1>; @@ -5850,7 +5897,11 @@ multiclass sme2_fmop4as_fp64_non_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_D, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_7, nxv2f64>; // Multiple and single vectors - def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>; + def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_D, 1>; + + def NAME # _M2ZZ_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_d_mul_r_Lo, ZPR64Mul2_Hi, SMEMatrixTileD>, SMEPseudo2Instr<NAME # _M2ZZ_D, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_D, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_7, nxv2f64>; // Single and multiple vectors def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_D, 1>; @@ -5895,7 +5946,11 @@ multiclass 
sme2_fmop4as_fp16_fp32_widening<bit S, string mnemonic, string op> { def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_HtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv8f16>; // Multiple and single vectors - def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_HtoS, 1>; + + def NAME # _M2ZZ_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_HtoS, 0>; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_HtoS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv8f16>; // Single and multiple vectors def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_HtoS, 1>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll new file mode 100644 index 0000000000000..ef1536fae6496 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll @@ -0,0 +1,393 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; Widening +define void @mop4a_za32_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void 
@mop4s_za32_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4a_za32_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4s_za32_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4a_za32_s8_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.s, { z0.b, z1.b 
}, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4s_za32_s8_u8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4a_za32_u8_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + +define void @mop4s_za32_u8_s8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) + ret void +} + + +define void @mop4a_za32_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za32_s16: +; CHECK: // 
%bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za32_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4a_za32_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za32_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, 
<vscale x 8 x i16> %zm) + ret void +} + +define void @mop4a_za32_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 { +; CHECK-LABEL: mop4a_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) + ret void +} + +define void @mop4s_za32_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 { +; CHECK-LABEL: mop4s_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) + ret void +} + +define void @mop4a_za32_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 { +; CHECK-LABEL: mop4a_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) + ret void +} + +define void @mop4s_za32_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 { +; CHECK-LABEL: mop4s_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: 
// kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) + ret void +} + +define void @mop4a_za64_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za64_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4a_za64_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za64_u16(<vscale x 8 x i16> %zn1, 
<vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4a_za64_s16_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za64_s16_u16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4a_za64_u16_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.d, { z0.h, 
z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +define void @mop4s_za64_u16_s16(<vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) + ret void +} + +; Non-widening +define void @mop4a_za16_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 { +; CHECK-LABEL: mop4a_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) + ret void +} + +define void @mop4s_za16_f16(<vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) #0 { +; CHECK-LABEL: mop4s_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) + ret void +} + +define void @mop4a_za32_f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) #0 { +; CHECK-LABEL: 
mop4a_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.s, z1.s }, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) + ret void +} + +define void @mop4s_za32_f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) #0 { +; CHECK-LABEL: mop4s_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.s, z1.s }, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zm) + ret void +} + +define void @mop4a_za64_f64(<vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) #0 { +; CHECK-LABEL: mop4a_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.d, { z0.d, z1.d }, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) + ret void +} + +define void @mop4s_za64_f64(<vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) #0 { +; CHECK-LABEL: mop4s_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.d, { z0.d, z1.d }, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 0, 
<vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zm) + ret void +} + +define void @mop4a_za16_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 { +; CHECK-LABEL: mop4a_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) + ret void +} + +define void @mop4s_za16_bf16(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) #0 { +; CHECK-LABEL: mop4s_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) + ret void +} + +attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits