https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/96883
>From cb2ebe232013576f57f8f26b9156fccd75d7d38f Mon Sep 17 00:00:00 2001 From: Marian Lukac <marian.lu...@arm.com> Date: Thu, 27 Jun 2024 09:38:17 +0000 Subject: [PATCH 1/2] [AArch64][NEON] Add intrinsics for LUTI --- clang/include/clang/Basic/arm_neon.td | 16 + clang/lib/CodeGen/CGBuiltin.cpp | 54 +++ clang/test/CodeGen/aarch64-neon-luti.c | 433 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 19 + .../lib/Target/AArch64/AArch64InstrFormats.td | 14 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 70 +++ llvm/test/CodeGen/AArch64/neon-luti.ll | 207 +++++++++ 7 files changed, 806 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-neon-luti.c create mode 100644 llvm/test/CodeGen/AArch64/neon-luti.ll diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 6390ba3f9fe5e..0dd76ce32fc20 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2096,3 +2096,19 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } + +//Lookup table read with 2-bit/4-bit indices +let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { + def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc">; + def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc">; + def VLUTI2_H : SInst<"vluti2_lane", "Q.(qU<)I", "sUsPshQsQUsQPsQh">; + def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(QU<)I", "sUsPshQsQUsQPsQh">; + def VLUTI4_B : SInst<"vluti4_laneq", "..UI", "QcQUcQPc">; + def VLUTI4_H_X2 : SInst<"vluti4_laneq_x2", ".2(U<)I", "QsQUsQPsQh">; + + let ArchGuard = "defined(__aarch64__)", TargetGuard= "lut,bf16" in { + def VLUTI2_BF : SInst<"vluti2_lane", "Q.(qU<)I", "bQb">; + def VLUTI2_BF_Q : SInst<"vluti2_laneq", "Q.(QU<)I", "bQb">; + def VLUTI4_BF_X2 : SInst<"vluti4_laneq_x2", ".2(U<)I", "Qb">; + } +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 511e1fd4016d7..f9ac6c9dc8504 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13357,6 +13357,60 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_suqadd; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + + case NEON::BI__builtin_neon_vluti2_lane_bf16: + case NEON::BI__builtin_neon_vluti2_lane_f16: + case NEON::BI__builtin_neon_vluti2_lane_p16: + case NEON::BI__builtin_neon_vluti2_lane_p8: + case NEON::BI__builtin_neon_vluti2_lane_s16: + case NEON::BI__builtin_neon_vluti2_lane_s8: + case NEON::BI__builtin_neon_vluti2_lane_u16: + case NEON::BI__builtin_neon_vluti2_lane_u8: + case NEON::BI__builtin_neon_vluti2_laneq_bf16: + case NEON::BI__builtin_neon_vluti2_laneq_f16: + case NEON::BI__builtin_neon_vluti2_laneq_p16: + case NEON::BI__builtin_neon_vluti2_laneq_p8: + case NEON::BI__builtin_neon_vluti2_laneq_s16: + case NEON::BI__builtin_neon_vluti2_laneq_s8: + case NEON::BI__builtin_neon_vluti2_laneq_u16: + case NEON::BI__builtin_neon_vluti2_laneq_u8: + case NEON::BI__builtin_neon_vluti2q_lane_bf16: + case NEON::BI__builtin_neon_vluti2q_lane_f16: + case NEON::BI__builtin_neon_vluti2q_lane_p16: + case NEON::BI__builtin_neon_vluti2q_lane_p8: + case NEON::BI__builtin_neon_vluti2q_lane_s16: + case NEON::BI__builtin_neon_vluti2q_lane_s8: + case NEON::BI__builtin_neon_vluti2q_lane_u16: + case NEON::BI__builtin_neon_vluti2q_lane_u8: + case NEON::BI__builtin_neon_vluti2q_laneq_bf16: + case NEON::BI__builtin_neon_vluti2q_laneq_f16: + case NEON::BI__builtin_neon_vluti2q_laneq_p16: + case NEON::BI__builtin_neon_vluti2q_laneq_p8: + case NEON::BI__builtin_neon_vluti2q_laneq_s16: + case NEON::BI__builtin_neon_vluti2q_laneq_s8: + case NEON::BI__builtin_neon_vluti2q_laneq_u16: + case NEON::BI__builtin_neon_vluti2q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti2_lane; + llvm::Type *Tys[3]; + Tys[0] = Ty; + Tys[1] = Ops[0]->getType(); + Tys[2] = Ops[1]->getType(); + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_p8: + case NEON::BI__builtin_neon_vluti4q_laneq_s8: + case NEON::BI__builtin_neon_vluti4q_laneq_u8: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq"); + } + case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2: + case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: { + Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2"); + } } } diff --git a/clang/test/CodeGen/aarch64-neon-luti.c b/clang/test/CodeGen/aarch64-neon-luti.c new file mode 100644 index 0000000000000..4f10acdce302a --- /dev/null +++ b/clang/test/CodeGen/aarch64-neon-luti.c @@ -0,0 +1,433 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +#include <arm_neon.h> +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -target-feature +bf16 -O3 -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2_lane_u8(uint8x8_t vn, uint8x8_t vm) { + return vluti2_lane_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_u8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v16i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2_laneq_u8(uint8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v8i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2q_lane_u8(uint8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_u8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +uint8x16_t test_vluti2q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_u8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2_lane_s8(int8x8_t vn, uint8x8_t vm) { + return vluti2_lane_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_s8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v16i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2_laneq_s8(int8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_s8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v8i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2q_lane_s8(int8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_s8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +int8x16_t test_vluti2q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_s8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2_lane_p8(poly8x8_t vn, uint8x8_t vm) { + return vluti2_lane_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_p8( +// CHECK-SAME: <8 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v16i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_p8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v8i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_p8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +poly8x16_t test_vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_p8(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +uint16x8_t test_vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm) { + return vluti2_lane_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_u16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v16i8(<8 x i8> [[TMP0]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +uint16x8_t test_vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_u16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +uint16x8_t test_vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_u16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_u16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +uint16x8_t test_vluti2q_laneq_u16(uint16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_u16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +int16x8_t test_vluti2_lane_s16(int16x4_t vn, uint8x8_t vm) { + return vluti2_lane_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_s16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v16i8(<8 x i8> [[TMP0]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +int16x8_t test_vluti2_laneq_s16(int16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_s16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +int16x8_t test_vluti2q_lane_s16(int16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_s16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_s16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +int16x8_t test_vluti2q_laneq_s16(int16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_s16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_lane_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8i8.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE]] +// +float16x8_t test_vluti2_lane_f16(float16x4_t vn, uint8x8_t vm) { + return vluti2_lane_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2_laneq_f16( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8i8.v16i8(<8 x i8> [[TMP0]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE]] +// +float16x8_t test_vluti2_laneq_f16(float16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_f16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_lane_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE]] +// +float16x8_t test_vluti2q_lane_f16(float16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_f16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti2q_laneq_f16( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v16i8.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x half> [[VLUTI2_LANE]] +// +float16x8_t test_vluti2q_laneq_f16(float16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_f16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_lane_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8i8.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE]] +// +bfloat16x8_t test_vluti2_lane_bf16(bfloat16x4_t vn, uint8x8_t vm) { + return vluti2_lane_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2_laneq_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8i8.v16i8(<8 x i8> [[TMP0]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE]] +// +bfloat16x8_t test_vluti2_laneq_bf16(bfloat16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_bf16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_lane_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE]] +// +bfloat16x8_t test_vluti2q_lane_bf16(bfloat16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_bf16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti2q_laneq_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v16i8.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI2_LANE]] +// +bfloat16x8_t test_vluti2q_laneq_bf16(bfloat16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_bf16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +poly16x8_t test_vluti2_lane_p16(poly16x4_t vn, uint8x8_t vm) { + return vluti2_lane_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2_laneq_p16( +// CHECK-SAME: <4 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VN]] to <8 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i8.v16i8(<8 x i8> [[TMP0]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +poly16x8_t test_vluti2_laneq_p16(poly16x4_t vn, uint8x16_t vm) { + return vluti2_laneq_p16(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_lane_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +poly16x8_t test_vluti2q_lane_p16(poly16x8_t vn, uint8x8_t vm) { + return vluti2q_lane_p16(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti2q_laneq_p16( +// CHECK-SAME: <8 x i16> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[VN]] to <16 x i8> +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v16i8.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x i16> [[VLUTI2_LANE]] +// +poly16x8_t test_vluti2q_laneq_p16(poly16x8_t vn, uint8x16_t vm) { + return vluti2q_laneq_p16(vn, vm, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_u8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +uint8x16_t test_vluti4q_laneq_u8(uint8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_u8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_s8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +int8x16_t test_vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_s8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +poly8x16_t test_vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_p8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_u16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +uint16x8_t test_vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_u16_x2(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_s16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +int16x8_t test_vluti4q_laneq_s16_x2(int16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_s16_x2(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vluti4q_laneq_f16_x2( +// CHECK-SAME: [2 x <8 x half>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> [[VN_COERCE_FCA_0_EXTRACT]], <8 x half> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <8 x half> [[VLUTI4Q_LANEQ_X24]] +// +float16x8_t test_vluti4q_laneq_f16_x2(float16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_f16_x2(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vluti4q_laneq_bf16_x2( +// CHECK-SAME: [2 x <8 x bfloat>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> [[VN_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 2) +// CHECK-NEXT: ret <8 x bfloat> [[VLUTI4Q_LANEQ_X24]] +// +bfloat16x8_t test_vluti4q_laneq_bf16_x2(bfloat16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_bf16_x2(vn, vm, 2); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vluti4q_laneq_p16_x2( +// CHECK-SAME: [2 x <8 x i16>] alignstack(16) [[VN_COERCE:%.*]], <16 x i8> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 0 +// CHECK-NEXT: [[VN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[VN_COERCE]], 1 +// CHECK-NEXT: [[VLUTI4Q_LANEQ_X24:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> [[VN_COERCE_FCA_0_EXTRACT]], <8 x i16> [[VN_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <8 x i16> [[VLUTI4Q_LANEQ_X24]] +// +poly16x8_t test_vluti4q_laneq_p16_x2(poly16x8x2_t vn, uint8x16_t vm) { + return vluti4q_laneq_p16_x2(vn, vm, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9a71aaa9f4434..a72740907f1f1 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -545,6 +545,25 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic; } +let TargetPrefix = "aarch64" in { +def int_aarch64_neon_vluti2_lane : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_anyvector_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + +def int_aarch64_neon_vluti4q_laneq: DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + + +def int_aarch64_neon_vluti4q_laneq_x2: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>]>; +} + let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". class AdvSIMD_2Vector2Index_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 17d011086634c..296da7bc4d0fa 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -8231,11 +8231,11 @@ multiclass SIMDTableLookupTied<bit op, string asm> { // AdvSIMD LUT //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc, RegisterOperand vectype, +class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc, RegisterOperand listtype, Operand idx_type, string asm, string kind> - : I<(outs vectype:$Rd), - (ins listtype:$Rn, vectype:$Rm, idx_type:$idx), + : I<(outs V128:$Rd), + (ins listtype:$Rn, V128:$Rm, idx_type:$idx), asm, "\t$Rd" # kind # ", $Rn, $Rm$idx", "", []>, Sched<[]> { bits<5> Rd; @@ -8255,22 +8255,22 @@ class BaseSIMDTableLookupIndexed<bit Q, bits<5> opc, RegisterOperand vectype, } multiclass BaseSIMDTableLookupIndexed2<string asm> { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, V128, VecListOne16b, VectorIndexS, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b10,?,?,0b1}, VecListOne16b, VectorIndexS32b_timm, asm, ".16b"> { bits<2> idx; let Inst{14-13} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, V128, VecListOne8h, VectorIndexH, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b11,?,?,?}, VecListOne8h, VectorIndexH32b_timm, asm, ".8h" > { bits<3> idx; let Inst{14-12} = idx; } } multiclass BaseSIMDTableLookupIndexed4<string asm> { - def v16f8 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, V128, VecListOne16b, VectorIndexD, asm, ".16b"> { + def _B : BaseSIMDTableLookupIndexed<0b1, {0b01,?,0b10}, VecListOne16b, VectorIndexD32b_timm, asm, ".16b"> { bit idx; let Inst{14} = idx; } - def v8f16 : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, V128, VecListTwo8h, VectorIndexS, asm, ".8h" > { + def _H : BaseSIMDTableLookupIndexed<0b1, {0b01,?,?,0b1}, VecListTwo8h, VectorIndexS32b_timm, asm, ".8h" > { bits<2> idx; let Inst{14-13} = idx; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 91e5bc3caa102..d5038345a11bb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6420,6 +6420,76 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 V128:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 V128:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 VecListOne8h:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 VecListOne8h:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; + + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq (v16i8 VecListOne16b:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexD32b_timm:$idx))), + (LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; + + def : Pat<(v8i16 (int_aarch64_neon_vluti4q_laneq_x2 (v8i16 VecListOne8h:$Rn1), + (v8i16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), + (i32 VectorIndexS32b_timm:$idx))), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti4q_laneq_x2 (v8f16 VecListOne8h:$Rn1), + (v8f16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), + (i32 VectorIndexS32b_timm:$idx))), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; +} + +let Predicates = [HasLUT, HasBF16] in { + def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v4bf16 V64:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v4bf16 V64:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v8bf16 V128:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v8bf16 VecListOne8h:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), + (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; + + def : Pat<(v8bf16 (int_aarch64_neon_vluti4q_laneq_x2 (v8bf16 VecListOne8h:$Rn1), + (v8bf16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), + (i32 VectorIndexS32b_timm:$idx))), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; } //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/neon-luti.ll b/llvm/test/CodeGen/AArch64/neon-luti.ll new file mode 100644 index 0000000000000..d46f04fbce847 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-luti.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon,+lut,+bf16 | FileCheck %s + +define <16 x i8> @test_luti2_lane_i8(<8 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v8i8(<8 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2_laneq_i8(<8 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8.v16i8(<8 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_lane_i8(<16 x i8> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v8i8(<16 x i8> %vn, <8 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <16 x i8> @test_luti2q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti2_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16.v8i8(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2_laneq_i16(<4 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v4i16.v16i8(<4 x i16> %vn, <16 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_lane_i16(<4 x i16> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16.v8i8(<4 x i16> %vn, <8 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x i16> @test_luti2q_laneq_i16(<8 x i16> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti2.lane.v8i16.v8i16.v16i8(<8 x i16> %vn, <16 x i8> %vm, i32 0) + ret <8 x i16> %res +} + +define <8 x half> @test_luti2_lane_f16(<4 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4f16.v8i8(<4 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2_laneq_f16(<4 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v4i16.v16i8(<4 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_lane_f16(<4 x half> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16.v8i8(<4 x half> %vn, <8 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x half> @test_luti2q_laneq_f16(<8 x half> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti2.lane.v8f16.v8f16.v16i8(<8 x half> %vn, <16 x i8> %vm, i32 0) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti2_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16.v8i8(<4 x bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2_laneq_bf16(<4 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v4bf16.v16i8(<4 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_lane_bf16(<4 x bfloat> %vn, <8 x i8> %vm){ +; CHECK-LABEL: test_luti2q_lane_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16.v8i8(<4 x bfloat> %vn, <8 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_luti2q_laneq_bf16(<8 x bfloat> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti2q_laneq_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: luti2 v0.8h, { v0.8h }, v1[0] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti2.lane.v8bf16.v8bf16.v16i8(<8 x bfloat> %vn, <16 x i8> %vm, i32 0) + ret <8 x bfloat> %res +} + +define <16 x i8> @test_luti4q_laneq_i8(<16 x i8> %vn, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 v0.16b, { v0.16b }, v1[0] +; CHECK-NEXT: ret + %res= tail call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> %vn, <16 x i8> %vm, i32 0) + ret <16 x i8> %res +} + +define <8 x i16> @test_luti4q_laneq_x2_i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x i16> @llvm.aarch64.neon.vluti4q.laneq.x2.v8i16(<8 x i16> %vn1, <8 x i16> %vn2, <16 x i8> %vm, i32 1) + ret <8 x i16> %res +} + +define <8 x half> @test_luti4q_laneq_x2_f16(<8 x half>%vn1, <8 x half> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x half> @llvm.aarch64.neon.vluti4q.laneq.x2.v8f16(<8 x half> %vn1, <8 x half> %vn2, <16 x i8> %vm, i32 1) + ret <8 x half> %res +} + +define <8 x bfloat> @test_luti4q_laneq_x2_bf16(<8 x bfloat>%vn1, <8 x bfloat> %vn2, <16 x i8> %vm){ +; CHECK-LABEL: test_luti4q_laneq_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: luti4 v0.8h, { v0.8h, v1.8h }, v2[1] +; CHECK-NEXT: ret + %res= tail call <8 x bfloat> @llvm.aarch64.neon.vluti4q.laneq.x2.v8bf16(<8 x bfloat> %vn1, <8 x bfloat> %vn2, <16 x i8> %vm, i32 1) + ret <8 x bfloat> %res +} >From 9e59cb5b50d58d60648d578ad5e1256dc17e00e6 Mon Sep 17 00:00:00 2001 From: Marian Lukac <marian.lu...@arm.com> Date: Fri, 28 Jun 2024 11:50:04 +0000 Subject: [PATCH 2/2] Cleaned up patterns for the instruction --- clang/include/clang/Basic/arm_neon.td | 2 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 97 +++++++-------------- 2 files changed, 31 insertions(+), 68 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 0dd76ce32fc20..7aa45f0ea3785 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2097,7 +2097,7 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } -//Lookup table read with 2-bit/4-bit indices +// Lookup table read with 2-bit/4-bit indices let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc">; def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc">; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d5038345a11bb..cd01316ad77ff 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6420,76 +6420,39 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + multiclass Luti2_patterns<Instruction Instr, ValueType VT64, ValueType VT128>{ + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), + VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + } + + defm : Luti2_patterns<LUT2_B, v8i8, v16i8>; + defm : Luti2_patterns<LUT2_H, v4i16, v8i16>; + defm : Luti2_patterns<LUT2_H, v4f16, v8f16>; + defm : Luti2_patterns<LUT2_H, v4bf16, v8bf16>; - def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), - (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; - def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), - (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexS32b_timm:$idx)>; - def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), - (LUT2_B V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; - def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), - (LUT2_B V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; - def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; - def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; - def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 V128:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 V128:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 VecListOne8h:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; - def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 VecListOne8h:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; - - def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq (v16i8 VecListOne16b:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexD32b_timm:$idx))), + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn, + v16i8:$Rm, i32:$idx)), (LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; - def : Pat<(v8i16 (int_aarch64_neon_vluti4q_laneq_x2 (v8i16 VecListOne8h:$Rn1), - (v8i16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), - (i32 VectorIndexS32b_timm:$idx))), - (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; - def : Pat<(v8f16 (int_aarch64_neon_vluti4q_laneq_x2 (v8f16 VecListOne8h:$Rn1), - (v8f16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), - (i32 VectorIndexS32b_timm:$idx))), - (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; -} - -let Predicates = [HasLUT, HasBF16] in { - def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v4bf16 V64:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v4bf16 V64:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; - def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v8bf16 V128:$Rn), - (v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; - def : Pat<(v8bf16 (int_aarch64_neon_vluti2_lane (v8bf16 VecListOne8h:$Rn), - (v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), - (LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; - - def : Pat<(v8bf16 (int_aarch64_neon_vluti4q_laneq_x2 (v8bf16 VecListOne8h:$Rn1), - (v8bf16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), - (i32 VectorIndexS32b_timm:$idx))), - (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + foreach VT = [v8i16, v8f16, v8bf16] in { + def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1, + VT:$Rn2, v16i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + } } //---------------------------------------------------------------------------- _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits