fpetrogalli created this revision.
fpetrogalli added reviewers: sdesmalen, ctetreau, efriedma, david-arm.
Herald added subscribers: llvm-commits, cfe-commits, psnobl, rkruppe, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: rengolin.
Herald added projects: clang, LLVM.
List of intrinsics:

svfloat32_t svbfdot[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfdot[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfdot_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)
svfloat32_t svbfmmla[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfmlalb[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfmlalb[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfmlalb_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)
svfloat32_t svbfmlalt[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfmlalt[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfmlalt_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)
svbfloat16_t svcvt_bf16[_f32]_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op)
svbfloat16_t svcvt_bf16[_f32]_x(svbool_t pg, svfloat32_t op)
svbfloat16_t svcvt_bf16[_f32]_z(svbool_t pg, svfloat32_t op)
svbfloat16_t svcvtnt_bf16[_f32]_m(svbfloat16_t even, svbool_t pg, svfloat32_t op)
svbfloat16_t svcvtnt_bf16[_f32]_x(svbfloat16_t even, svbool_t pg, svfloat32_t op)

For reference, see section 7.2 of "Arm C Language Extensions for SVE - Version 00bet4". A short usage sketch follows the file list below.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D82141

Files:
  clang/include/clang/Basic/arm_sve.td
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
  clang/utils/TableGen/SveEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
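For context only (not part of the patch), a minimal sketch of how two of these intrinsics combine in user code. The function name, the all-true predicate, and the compile flags are illustrative assumptions; it presumes a toolchain with the SVE BF16 ACLE enabled (e.g. -march=armv8.6-a+sve+bf16).

```c
// Hypothetical example: accumulate a BFloat16 2-way dot product into an
// f32 accumulator, then narrow the result back to BFloat16.
#include <arm_sve.h>

svbfloat16_t dot_and_narrow(svfloat32_t acc, svbfloat16_t a, svbfloat16_t b) {
  // svbfdot_f32: each pair of bf16 elements of 'a' and 'b' is multiplied and
  // accumulated into the corresponding 32-bit float lane of 'acc'.
  svfloat32_t sum = svbfdot_f32(acc, a, b);
  // svcvt_bf16_f32_x: convert the f32 accumulator back to bf16; the "_x"
  // form leaves inactive lanes unspecified, predicate chosen all-true here.
  return svcvt_bf16_f32_x(svptrue_b32(), sum);
}
```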
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
@@ -0,0 +1,243 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
+
+;
+; BFDOT
+;
+
+define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_f32:
+; CHECK-NEXT: bfdot z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_0_f32:
+; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[0]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_1_f32:
+; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_2_f32:
+; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_3_f32:
+; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMLALB
+;
+
+define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_0_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[0]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_1_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_2_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_3_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_4_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[4]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_5_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[5]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_6_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[6]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_7_f32:
+; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMLALT
+;
+
+define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_0_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[0]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_1_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_2_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_3_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_4_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[4]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_5_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[5]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_6_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[6]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_7_f32:
+; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMMLA
+;
+
+define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmmla_f32:
+; CHECK-NEXT: bfmmla z0.s, z1.h, z2.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFCVT
+;
+
+define <vscale x 8 x bfloat> @cvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
+; CHECK-LABEL: cvt_bf16_f32:
+; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+;
+; BFCVTNT
+;
+
+define <vscale x 8 x bfloat> @cvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
+; CHECK-LABEL: cvtnt_bf16_f32:
+; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7460,6 +7460,11 @@
   let Inst{20-16} = Zm;
 }
 
+multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_dot_indexed<string asm>
 : sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
   (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
@@ -7469,6 +7474,11 @@
   let Inst{18-16} = Zm;
 }
 
+multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot_indexed<asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul<string asm>
 : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
     asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
@@ -7486,6 +7496,11 @@
   let ElementSize = ElementSizeH;
 }
 
+multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul_longvecl<bit BT, string asm>
 : sve_bfloat_matmul<asm> {
   let Inst{23} = 0b1;
@@ -7493,6 +7508,11 @@
   let Inst{10} = BT;
 }
 
+multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
 : sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
   (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
@@ -7505,6 +7525,11 @@
   let Inst{10} = BT;
 }
 
+multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_convert<bit N, string asm>
 : I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
     asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
@@ -7524,6 +7549,11 @@
   let ElementSize = ElementSizeS;
 }
 
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_convert<N, asm>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Integer Matrix Multiply Group
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1239,15 +1239,15 @@
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  int_aarch64_sve_fsqrt>;
 
   let Predicates = [HasBF16, HasSVE] in {
-  def BFDOT_ZZZ    : sve_bfloat_dot<"bfdot">;
-  def BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot">;
-  def BFMMLA_ZZZ   : sve_bfloat_matmul<"bfmmla">;
-  def BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb">;
-  def BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt">;
-  def BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb">;
-  def BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt">;
-  def BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt">;
-  def BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt">;
+  defm BFDOT_ZZZ    : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+  defm BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+  defm BFMMLA_ZZZ   : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
+  defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+  defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+  defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+  defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+  defm BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_cvt_bf16f32>;
+  defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_cvtnt_bf16f32>;
   }
 
 // InstAliases
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1340,6 +1340,21 @@
                 [LLVMMatchType<0>,
                  LLVMSubdivide4VectorType<0>,
                  LLVMSubdivide4VectorType<0>],
                 [IntrNoMem]>;
 
+  class SVE_bfloat
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_nxv8bf16_ty,
+                 llvm_nxv8bf16_ty],
+                [IntrNoMem]>;
+
+  class SVE_bfloat_index
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_nxv8bf16_ty,
+                 llvm_nxv8bf16_ty,
+                 llvm_i64_ty],
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
   //
   // Vector tuple creation intrinsics (ACLE)
   //
@@ -1793,6 +1808,9 @@
 def int_aarch64_sve_fcvtzs_i64f16 : Builtin_SVCVT<"svcvt_s64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
 def int_aarch64_sve_fcvtzs_i64f32 : Builtin_SVCVT<"svcvt_s64_f32_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
 
+def int_aarch64_sve_cvt_bf16f32   : Builtin_SVCVT<"svcvt_bf16_f32_m",   llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_cvtnt_bf16f32 : Builtin_SVCVT<"svcvtnt_bf16_f32_m", llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+
 def int_aarch64_sve_fcvtzu_i32f16 : Builtin_SVCVT<"svcvt_u32_f16_m", llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
 def int_aarch64_sve_fcvtzu_i32f64 : Builtin_SVCVT<"svcvt_u32_f64_m", llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
 def int_aarch64_sve_fcvtzu_i64f16 : Builtin_SVCVT<"svcvt_u64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
@@ -2343,6 +2361,19 @@
 //
 def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
+//
+// SVE ACLE: 7.2. BFloat16 extensions
+//
+
+def int_aarch64_sve_bfdot   : SVE_bfloat;
+def int_aarch64_sve_bfmlalb : SVE_bfloat;
+def int_aarch64_sve_bfmlalt : SVE_bfloat;
+
+def int_aarch64_sve_bfmmla  : SVE_bfloat;
+
+def int_aarch64_sve_bfdot_lane   : SVE_bfloat_index;
+def int_aarch64_sve_bfmlalb_lane : SVE_bfloat_index;
+def int_aarch64_sve_bfmlalt_lane : SVE_bfloat_index;
 }
 
 //
Index: clang/utils/TableGen/SveEmitter.cpp
===================================================================
--- clang/utils/TableGen/SveEmitter.cpp
+++ clang/utils/TableGen/SveEmitter.cpp
@@ -215,13 +215,13 @@
   /// Return true if the intrinsic takes a splat operand.
   bool hasSplat() const {
     // These prototype modifiers are described in arm_sve.td.
-    return Proto.find_first_of("ajfrKLR@") != std::string::npos;
+    return Proto.find_first_of("ajfrKLR@~") != std::string::npos;
   }
 
   /// Return the parameter index of the splat operand.
   unsigned getSplatIdx() const {
     // These prototype modifiers are described in arm_sve.td.
-    auto Idx = Proto.find_first_of("ajfrKLR@");
+    auto Idx = Proto.find_first_of("ajfrKLR@~");
     assert(Idx != std::string::npos && Idx > 0 &&
            "Prototype has no splat operand");
     return Idx - 1;
@@ -556,6 +556,12 @@
     Bitwidth = ElementBitwidth;
     NumVectors = 0;
     break;
+  case '~':
+    ElementBitwidth = 16;
+    BFloat = true;
+    Float = false;
+    NumVectors = 0;
+    break;
   case 'R':
     ElementBitwidth /= 2;
     NumVectors = 0;
@@ -688,6 +694,12 @@
     Float = true;
     ElementBitwidth = 64;
     break;
+  case '$':
+    Predicate = false;
+    BFloat = true;
+    Float = false;
+    ElementBitwidth = 16;
+    break;
   case 'Q':
     Constant = true;
    Pointer = true;
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svcvtnt_bf16_f32_x(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvtnt_bf16_f32_x
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvtnt_bf16, _f32, _x, )(pg, op);
+}
+
+svbfloat16_t test_svcvtnt_bf16_f32_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvtnt_bf16_f32_m
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvtnt_bf16, _f32, _m, )(inactive, pg, op);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svcvt_bf16_f32_x(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_x
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _x, )(pg, op);
+}
+
+svbfloat16_t test_svcvt_bf16_f32_z(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_z
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _z, )(pg, op);
+}
+
+svbfloat16_t test_svcvt_bf16_f32_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_m
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _m, )(inactive, pg, op);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_bfmmla_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmmla_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmmla, _f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_svbfmlalt_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_svbfmlalt_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfmlalt_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfmlalt_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfmlalt_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfmlalt_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfmlalt_lane_4_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_4_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 4)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 4);
+}
+
+svfloat32_t test_bfmlalt_lane_5_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_5_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 5)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 5);
+}
+
+svfloat32_t test_bfmlalt_lane_6_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_6_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 6)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 6);
+}
+
+svfloat32_t test_bfmlalt_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_7_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 7)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 7);
+}
+
+svfloat32_t test_bfmlalt_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt, _n_f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_svbfmlalb_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_svbfmlalb_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfmlalb_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfmlalb_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfmlalb_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfmlalb_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfmlalb_lane_4_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_4_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 4)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 4);
+}
+
+svfloat32_t test_bfmlalb_lane_5_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_5_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 5)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 5);
+}
+
+svfloat32_t test_bfmlalb_lane_6_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_6_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 6)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 6);
+}
+
+svfloat32_t test_bfmlalb_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_7_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 7)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 7);
+}
+
+svfloat32_t test_bfmlalb_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb, _n_f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_bfdot_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfdot_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfdot_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfdot_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfdot_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfdot_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot, _n_f32, , )(x, y, z);
+}
Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -71,6 +71,7 @@
 // R: scalar of 1/2 width element type (splat to vector type)
 // r: scalar of 1/4 width element type (splat to vector type)
 // @: unsigned scalar of 1/4 width element type (splat to vector type)
+// ~: bfloat scalar (splat to vector type)
 // e: 1/2 width unsigned elements, 2x element count
 // b: 1/4 width unsigned elements, 4x element count
 // h: 1/2 width elements, 2x element count
@@ -96,6 +97,7 @@
 // O: svfloat16_t
 // M: svfloat32_t
 // N: svfloat64_t
+// $: svbfloat16_t
 
 // J: Prefetch type (sv_prfop)
 
 // A: pointer to int8_t
@@ -487,6 +489,20 @@
 let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
   def SVLD1RO_BF : SInst<"svld1ro[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1ro">;
 }
+
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVBFDOT        : SInst<"svbfdot[_{0}]",    "dd$$", "f", MergeNone, "aarch64_sve_bfdot">;
+  def SVBFMLALB      : SInst<"svbfmlalb[_{0}]",  "dd$$", "f", MergeNone, "aarch64_sve_bfmlalb">;
+  def SVBFMLALT      : SInst<"svbfmlalt[_{0}]",  "dd$$", "f", MergeNone, "aarch64_sve_bfmlalt">;
+  def SVBFMMLA       : SInst<"svbfmmla[_{0}]",   "dd$$", "f", MergeNone, "aarch64_sve_bfmmla">;
+  def SVBFDOT_N      : SInst<"svbfdot[_n_{0}]",   "dd$~", "f", MergeNone, "aarch64_sve_bfdot">;
+  def SVBFMLAL_N     : SInst<"svbfmlalb[_n_{0}]", "dd$~", "f", MergeNone, "aarch64_sve_bfmlalb">;
+  def SVBFMLALT_N    : SInst<"svbfmlalt[_n_{0}]", "dd$~", "f", MergeNone, "aarch64_sve_bfmlalt">;
+  def SVBFDOT_LANE   : SInst<"svbfdot_lane[_{d}]",   "dd$$n", "f", MergeNone, "aarch64_sve_bfdot_lane",   [], [ImmCheck<3, ImmCheck0_3>]>;
+  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{d}]", "dd$$n", "f", MergeNone, "aarch64_sve_bfmlalb_lane", [], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{d}]", "dd$$n", "f", MergeNone, "aarch64_sve_bfmlalt_lane", [], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
 
@@ -1010,6 +1026,13 @@
 defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i",  "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
 defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l",  "aarch64_sve_fcvtzs_i64f32">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  // svcvt_bf16_f32
+  defm SVCVT_BF16_F32   : SInstCvtMXZ<"svcvt_bf16[_f32]",   "ddPM", "dPM", "b",  "aarch64_sve_cvt_bf16f32">;
+  // svcvtnt_bf16_f32
+  defm SVCVTNT_BF16_F32 : SInstCvtMX<"svcvtnt_bf16[_f32]",  "ddPM", "dPM", "b",  "aarch64_sve_cvtnt_bf16f32">;
+}
+
 // svcvt_s##_f64
 defm SVFCVTZS_S32_F64 : SInstCvtMXZ<"svcvt_s32[_f64]", "ttPd", "tPd", "d",  "aarch64_sve_fcvtzs_i32f64">;
 defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l",  "aarch64_sve_fcvtzs", [IsOverloadCvt]>;