https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/97058
>From 4a6c4033f7deddcd4094ebde81402960de85bd80 Mon Sep 17 00:00:00 2001 From: Marian Lukac <marian.lu...@arm.com> Date: Fri, 28 Jun 2024 10:13:16 +0000 Subject: [PATCH 1/3] [AARCH64][SVE] Add intrinsics for SVE LUTI instructions --- clang/include/clang/Basic/arm_sve.td | 21 +- .../aarch64-sve2-intrinsics/acle_sve2_luti.c | 336 ++++++++++++++++++ .../acle_sve2_imm_lane.cpp | 32 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 ++ llvm/lib/Target/AArch64/SVEInstrFormats.td | 37 +- .../CodeGen/AArch64/sve2-intrinsics-luti.ll | 107 ++++++ 6 files changed, 551 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c create mode 100644 llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 94c093d8911562..dc999a5bbb3d88 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1939,6 +1939,25 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } + +//////////////////////////////////////////////////////////////////////////////// +// SVE2 - Lookup table +let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { + def SVLUTI2_B : SInst<"svluti2_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI2_H : SInst<"svluti2_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti2_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + + def SVLUTI4_B : SInst<"svluti4_lane[_{d}]", "dd[i", "cUc", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI4_H : SInst<"svluti4_lane[_{d}]", "dd[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + + def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "sUsh", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + +let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { + def SVLUTI2_BF16 : SInst<"svluti2_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti2_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_BF16 : SInst<"svluti4_lane[_{d}]", "dd[i", "b", MergeNone, "aarch64_sve_luti4_lane", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_BF16_x2 : SInst<"svluti4_lane[_{d}]_x2", "d2.d[i", "b", MergeNone, "aarch64_sve_luti4_lane_x2", [ VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; +} + //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional @@ -2384,4 +2403,4 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { def SVBFMLSLB_LANE : SInst<"svbfmlslb_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslb_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; def SVBFMLSLT_LANE : SInst<"svbfmlslt_lane[_{d}]", "dd$$i", "f", MergeNone, "aarch64_sve_bfmlslt_lane", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<3, ImmCheck0_7>]>; -} +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c new file mode 100644 index 00000000000000..d19246cba2d379 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c @@ -0,0 +1,336 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +#include <arm_sve.h> + +#if defined __ARM_FEATURE_SME +#define MODE_ATTR __arm_streaming +#else +#define MODE_ATTR +#endif + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// SME-CHECK-LABEL: @test_svluti2_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svint8_t test_svluti2_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti2_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svuint8_t test_svluti2_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u8,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svint16_t test_svluti2_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svuint16_t test_svluti2_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_u16,)(table, indices, 7); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti2_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5) +// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svluti2_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_f16,)(table, indices, 5); +} + +// SME-CHECK-LABEL: @test_svluti2_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// CHECK-LABEL: @test_svluti2_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti2_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +svbfloat16_t test_svluti2_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti2_lane,_bf16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_s8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svint8_t test_svluti4_lane_s8(svint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s8,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u8( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// SME-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z20test_svluti4_lane_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svuint8_t test_svluti4_lane_u8(svuint8_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u8,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_s16u11__SVInt16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svint16_t test_svluti4_lane_s16(svint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_u16u12__SVUint16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svuint16_t test_svluti4_lane_u16(svuint16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z21test_svluti4_lane_f16u13__SVFloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svluti4_lane_f16(svfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// CHECK-LABEL: @test_svluti4_lane_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z22test_svluti4_lane_bf16u14__SVBfloat16_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> [[TABLE:%.*]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +svbfloat16_t test_svluti4_lane_bf16(svbfloat16_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,)(table, indices, 1); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_s16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_s16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_s16_x211svint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// +svint16_t test_svluti4_lane_s16_x2(svint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_s16,_x2)(table, indices, 0); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_u16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 7) +// SME-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_u16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_u16_x212svuint16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 3) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]] +// +svuint16_t test_svluti4_lane_u16_x2(svuint16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_u16,_x2)(table, indices, 3); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_f16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 5) +// SME-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z24test_svluti4_lane_f16_x213svfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]] +// +svfloat16_t test_svluti4_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_f16,_x2)(table, indices, 2); +} + +// SME-CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// SME-CHECK-NEXT: entry: +// SME-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0) +// SME-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8) +// SME-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 2) +// SME-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]] +// CHECK-LABEL: @test_svluti4_lane_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]] +// +// CPP-CHECK-LABEL: @_Z25test_svluti4_lane_bf16_x214svbfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[TABLE]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 16 x i8> [[INDICES:%.*]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP2]] +// +svbfloat16_t test_svluti4_lane_bf16_x2(svbfloat16x2_t table, svuint8_t indices) MODE_ATTR{ + return SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2)(table, indices, 1); +} diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp index bca063385420a4..e405077b3de939 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_imm_lane.cpp @@ -78,6 +78,14 @@ void test_range_0_7() SVE_ACLE_FUNC(svqrdmlsh_lane,_s16,,)(svundef_s16(), svundef_s16(), svundef_s16(), -1); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s16,,)(svundef_s16(), svundef_s16(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svluti2_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); } void test_range_0_3() @@ -146,6 +154,26 @@ void test_range_0_3() SVE_ACLE_FUNC(svqdmullb_lane,_s64,,)(svundef_s32(), svundef_s32(), 4); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svqdmullt_lane,_s64,,)(svundef_s32(), svundef_s32(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti2_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,,)(svundef_s16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,,)(svundef_u16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,,)(svundef_f16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,,)(svundef_bf16(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_s16,_x2,)(svcreate2_s16(svundef_s16(),svundef_s16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_u16,_x2,)(svcreate2_u16(svundef_u16(),svundef_u16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_f16,_x2,)(svcreate2_f16(svundef_f16(),svundef_f16()), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 3]}} + SVE_ACLE_FUNC(svluti4_lane,_bf16,_x2,)(svcreate2_bf16(svundef_bf16(),svundef_bf16()), svundef_u8(), -1); } void test_range_0_1() @@ -180,4 +208,8 @@ void test_range_0_1() SVE_ACLE_FUNC(svqrdmlsh_lane,_s64,,)(svundef_s64(), svundef_s64(), svundef_s64(), 2); // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} SVE_ACLE_FUNC(svqrdmulh_lane,_s64,,)(svundef_s64(), svundef_s64(), 2); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_s8,,)(svundef_s8(), svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti4_lane,_u8,,)(svundef_u8(), svundef_u8(), -1); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 38d71b17b476d5..a443d6858b2a0d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1268,6 +1268,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SVE2_LUTI_Inrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + class SVE2_1VectorArg_Long_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, @@ -2662,6 +2669,19 @@ def int_aarch64_sve_sm4ekey : ClangBuiltin<"__builtin_sve_svsm4ekey_u32">, def int_aarch64_sve_tbl2 : AdvSIMD_SVE2_TBX_Intrinsic; def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic; +// +// SVE2 - Lookup Table +// + +def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic; +def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; + // // SVE2 - Optional bit permutation // diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index fc7d3cdda4acd5..c87d0746bc6a17 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10349,6 +10349,16 @@ multiclass sve2_luti2_vector_index<string mnemonic> { let Inst{23-22} = idx{2-1}; let Inst{12} = idx{0}; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti2_lane, nxv16i8, nxv16i8, + i32, timm32_0_3, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti2_lane, nxv8i16, nxv16i8, + i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti2_lane, nxv8f16, nxv16i8, + i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8, + i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; + } // FP8 Look up table read with 4-bit indices @@ -10361,14 +10371,39 @@ multiclass sve2_luti4_vector_index<string mnemonic> { bits<2> idx; let Inst{23-22} = idx; } + + def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti4_lane, nxv16i8, nxv16i8, + i32, timm32_0_1, !cast<Instruction>(NAME # _B)>; + def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti4_lane, nxv8i16, nxv16i8, + i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti4_lane, nxv8f16, nxv16i8, + i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti4_lane, nxv8bf16, nxv16i8, + i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; } // FP8 Look up table read with 4-bit indices (two contiguous registers) multiclass sve2_luti4_vector_vg2_index<string mnemonic> { - def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> { + def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> { bits<2> idx; let Inst{23-22} = idx; } + + def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8i16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8f16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; + def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), + (nxv8bf16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, + nxv8bf16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_3:$Op4))>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll new file mode 100644 index 00000000000000..5cea7536e1f3ca --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+sve2,+lut,+bf16 | FileCheck %s + +define <vscale x 16 x i8> @test_luti2_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti2.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 16 x i8> %res +} + +define <vscale x 8 x i16> @test_luti2_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti2.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x half> @test_luti2_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti2.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x half> %res +} + +define <vscale x 8 x bfloat> @test_luti2_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti2.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x bfloat> %res +} + +define <vscale x 16 x i8> @test_luti4_lane_i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.b, { z0.b }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti4.lane.nxv16i8(<vscale x 16 x i8> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 16 x i8> %res +} + +define <vscale x 8 x i16> @test_luti4_lane_i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.nxv8i16(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x half> @test_luti4_lane_f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.nxv8f16(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x half> %res +} + +define <vscale x 8 x bfloat> @test_luti4_lane_bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 z0.h, { z0.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x bfloat> %res +} + +define <vscale x 8 x i16> @test_luti4_lane_i16_x2(<vscale x 8 x i16> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> %table, <vscale x 8 x i16> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x half> @test_luti4_lane_f16_x2(<vscale x 8 x half> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> %table, <vscale x 8 x half> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x half> %res +} + +define <vscale x 8 x bfloat> @test_luti4_lane_bf16_x2(<vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices){ +; CHECK-LABEL: test_luti4_lane_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: luti4 z0.h, { z2.h, z3.h }, z1[0] +; CHECK-NEXT: ret + %res= tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> %table, <vscale x 8 x bfloat> %table, <vscale x 16 x i8> %indices, i32 0) + ret <vscale x 8 x bfloat> %res +} >From 160a73b17e409aff972d98e3ddeab438828e88ae Mon Sep 17 00:00:00 2001 From: Marian Lukac <marian.lu...@arm.com> Date: Thu, 4 Jul 2024 13:00:00 +0000 Subject: [PATCH 2/3] Fixes --- clang/include/clang/Basic/arm_sve.td | 1 - clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c | 1 + llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index dc999a5bbb3d88..d152b53c738604 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1939,7 +1939,6 @@ def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [VerifyRunti def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [VerifyRuntimeMode]>; } - //////////////////////////////////////////////////////////////////////////////// // SVE2 - Lookup table let SVETargetGuard = "sve2,lut", SMETargetGuard = "sme2,lut" in { diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c index d19246cba2d379..60c4828c407e8e 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_luti.c @@ -8,6 +8,7 @@ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu \ // RUN: -target-feature +sve -target-feature +sve2 -target-feature +lut -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +lut -target-feature +bf16 -O1 -Werror -Wall -o /dev/null %s #include <arm_sve.h> #if defined __ARM_FEATURE_SME diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index c87d0746bc6a17..71cab1d41b31d6 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10358,7 +10358,6 @@ multiclass sve2_luti2_vector_index<string mnemonic> { i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8, i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; - } // FP8 Look up table read with 4-bit indices >From f447ecae0d8ef10570830e00b67bb4d1df1774e3 Mon Sep 17 00:00:00 2001 From: Marian Lukac <marian.lu...@arm.com> Date: Thu, 11 Jul 2024 11:18:06 +0000 Subject: [PATCH 3/3] Removed trailing whitespace --- llvm/lib/Target/AArch64/SVEInstrFormats.td | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 71cab1d41b31d6..06d837c5f2cee2 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10350,13 +10350,13 @@ multiclass sve2_luti2_vector_index<string mnemonic> { let Inst{12} = idx{0}; } - def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti2_lane, nxv16i8, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti2_lane, nxv16i8, nxv16i8, i32, timm32_0_3, !cast<Instruction>(NAME # _B)>; - def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti2_lane, nxv8i16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti2_lane, nxv8i16, nxv16i8, i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti2_lane, nxv8f16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti2_lane, nxv8f16, nxv16i8, i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti2_lane, nxv8bf16, nxv16i8, i32, timm32_0_7, !cast<Instruction>(NAME # _H)>; } @@ -10371,13 +10371,13 @@ multiclass sve2_luti4_vector_index<string mnemonic> { let Inst{23-22} = idx; } - def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti4_lane, nxv16i8, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv16i8, int_aarch64_sve_luti4_lane, nxv16i8, nxv16i8, i32, timm32_0_1, !cast<Instruction>(NAME # _B)>; - def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti4_lane, nxv8i16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8i16, int_aarch64_sve_luti4_lane, nxv8i16, nxv16i8, i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti4_lane, nxv8f16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8f16, int_aarch64_sve_luti4_lane, nxv8f16, nxv16i8, i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti4_lane, nxv8bf16, nxv16i8, + def : SVE_3_Op_Imm_Pat<nxv8bf16, int_aarch64_sve_luti4_lane, nxv8bf16, nxv16i8, i32, timm32_0_3, !cast<Instruction>(NAME # _H)>; } @@ -10388,17 +10388,17 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> { let Inst{23-22} = idx; } - def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2, + def : Pat<(nxv8i16 (int_aarch64_sve_luti4_lane_x2 nxv8i16:$Op1, nxv8i16:$Op2, nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), (nxv8i16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, nxv8i16:$Op2, zsub1), nxv16i8:$Op3, timm32_0_3:$Op4))>; - def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2, + def : Pat<(nxv8f16 (int_aarch64_sve_luti4_lane_x2 nxv8f16:$Op1, nxv8f16:$Op2, nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), (nxv8f16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, nxv8f16:$Op2, zsub1), nxv16i8:$Op3, timm32_0_3:$Op4))>; - def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2, + def : Pat<(nxv8bf16 (int_aarch64_sve_luti4_lane_x2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv16i8:$Op3, (i32 timm32_0_3:$Op4))), (nxv8bf16 (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1), _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits