olista01 created this revision. olista01 added reviewers: rengolin, SjoerdMeijer, flyingforyou. Herald added a reviewer: javed.absar. Herald added subscribers: llvm-commits, chrib, kristof.beyls.
The ACLE specification that describes these ARM v8.2-A dot product intrinsics has not been published yet, but this patch is based on the final draft, which will be published soon, and GCC has already implemented these intrinsics. Repository: rL LLVM https://reviews.llvm.org/D46109 Files: include/clang/Basic/arm_neon.td include/clang/Basic/arm_neon_incl.td lib/CodeGen/CGBuiltin.cpp test/CodeGen/aarch64-neon-dot-product.c test/CodeGen/arm-neon-dot-product.c utils/TableGen/NeonEmitter.cpp
Index: utils/TableGen/NeonEmitter.cpp =================================================================== --- utils/TableGen/NeonEmitter.cpp +++ utils/TableGen/NeonEmitter.cpp @@ -995,6 +995,19 @@ if (!AppliedQuad) Bitwidth *= 2; break; + case '7': + if (AppliedQuad) + Bitwidth /= 2; + ElementBitwidth = 8; + break; + case '8': + ElementBitwidth = 8; + break; + case '9': + if (!AppliedQuad) + Bitwidth *= 2; + ElementBitwidth = 8; + break; default: llvm_unreachable("Unhandled character!"); } Index: test/CodeGen/arm-neon-dot-product.c =================================================================== --- /dev/null +++ test/CodeGen/arm-neon-dot-product.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a57 -target-feature +dotprod \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s + +// REQUIRES: arm-registered-target + +// Test ARM v8.2-A dot product intrinsics + +#include <arm_neon.h> + +uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_u32(a, b, c); +} + +uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_u32(a, b, c); +} + +int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_s32(a, b, c); +} + 
+int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_s32(a, b, c); +} + +uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_u32(a, b, c, 1); +} + +int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call 
<2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_s32(a, b, c, 1); +} Index: test/CodeGen/aarch64-neon-dot-product.c =================================================================== --- /dev/null +++ test/CodeGen/aarch64-neon-dot-product.c @@ -0,0 +1,117 @@ +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +dotprod \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s + +// REQUIRES: aarch64-registered-target + +// Test AArch64 v8.2-A dot product intrinsics + +#include <arm_neon.h> + +uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_u32(a, b, c); +} + +uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_u32(a, b, c); +} + +int32x2_t 
test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_s32(a, b, c); +} + +int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_s32(a, b, c); +} + +uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_u32(a, b, c, 1); +} + +uint32x2_t test_vdot_laneq_u32(uint32x2_t a, uint8x8_t b, uint8x16_t c) { +// 
CHECK-LABEL: define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_laneq_u32(a, b, c, 1); +} + +uint32x4_t test_vdotq_laneq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_laneq_u32(a, b, c, 1); +} + +int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_lane_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32> +// 
CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_lane_s32(a, b, c, 1); +} + +int32x2_t test_vdot_laneq_s32(int32x2_t a, int8x8_t b, int8x16_t c) { +// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> +// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]]) +// CHECK: ret <2 x i32> [[RESULT]] + return vdot_laneq_s32(a, b, c, 1); +} + +int32x4_t test_vdotq_laneq_s32(int32x4_t a, int8x16_t b, int8x16_t c) { +// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) +// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> +// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]]) +// CHECK: ret <4 x i32> [[RESULT]] + return vdotq_laneq_s32(a, b, c, 1); +} + Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -3864,6 +3864,8 @@ NEONMAP0(vcvtq_u16_v), NEONMAP0(vcvtq_u32_v), NEONMAP0(vcvtq_u64_v), + NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0), + NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0), 
NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -4058,6 +4060,8 @@ NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), + NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0), + NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -4971,6 +4975,14 @@ } return SV; } + case NEON::BI__builtin_neon_vdot_v: + case NEON::BI__builtin_neon_vdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot"); + } } assert(Int && "Expected valid intrinsic number"); Index: include/clang/Basic/arm_neon_incl.td =================================================================== --- include/clang/Basic/arm_neon_incl.td +++ include/clang/Basic/arm_neon_incl.td @@ -253,6 +253,9 @@ // B,C,D: array of default elts, force 'Q' size modifier. // p: pointer type // c: const pointer type +// 7: vector of 8-bit elements, ignore 'Q' size modifier +// 8: vector of 8-bit elements, same width as default type +// 9: vector of 8-bit elements, force 'Q' size modifier // Every intrinsic subclasses Inst. 
class Inst <string n, string p, string t, Operation o> { Index: include/clang/Basic/arm_neon.td =================================================================== --- include/clang/Basic/arm_neon.td +++ include/clang/Basic/arm_neon.td @@ -199,6 +199,13 @@ (bitcast "int16_t", $p0), (bitcast "int16x8_t", $p1), $p2))>; +def OP_DOT_LN + : Op<(call "vdot", $p0, $p1, + (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>; +def OP_DOT_LNQ + : Op<(call "vdot", $p0, $p1, + (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -1575,3 +1582,12 @@ def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "sdi", "Sh">; def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">; } + +// v8.2-A dot product instructions +let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { + def DOT : SInst<"vdot", "dd88", "iQiUiQUi">; + def DOT_LANE : SOpInst<"vdot_lane", "dd87i", "iUiQiQUi", OP_DOT_LN>; +} +let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { + def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>; +}
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits