fpetrogalli created this revision.
fpetrogalli added reviewers: sdesmalen, ctetreau, efriedma, david-arm.
Herald added subscribers: llvm-commits, cfe-commits, psnobl, rkruppe, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: rengolin.
Herald added projects: clang, LLVM.

List of intrinsics:

svfloat32_t svbfdot[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfdot[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfdot_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)

svfloat32_t svbfmmla[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)

svfloat32_t svbfmlalb[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfmlalb[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfmlalb_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)

svfloat32_t svbfmlalt[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3)
svfloat32_t svbfmlalt[_n_f32](svfloat32_t op1, svbfloat16_t op2, bfloat16_t op3)
svfloat32_t svbfmlalt_lane[_f32](svfloat32_t op1, svbfloat16_t op2, svbfloat16_t op3, uint64_t imm_index)

svbfloat16_t svcvt_bf16[_f32]_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op)
svbfloat16_t svcvt_bf16[_f32]_x(svbool_t pg, svfloat32_t op)
svbfloat16_t svcvt_bf16[_f32]_z(svbool_t pg, svfloat32_t op)

svbfloat16_t svcvtnt_bf16[_f32]_m(svbfloat16_t even, svbool_t pg, svfloat32_t op)
svbfloat16_t svcvtnt_bf16[_f32]_x(svbfloat16_t even, svbool_t pg, svfloat32_t op)

For reference, see section 7.2 of "Arm C Language Extensions for SVE - Version 00bet4".


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D82141

Files:
  clang/include/clang/Basic/arm_sve.td
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
  clang/utils/TableGen/SveEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll

Index: llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
@@ -0,0 +1,243 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
+
+;
+; BFDOT
+;
+
+define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_f32:
+; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_0_f32:
+; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[0]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_1_f32:
+; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[1]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_2_f32:
+; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[2]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfdot_lane_3_f32:
+; CHECK-NEXT:  bfdot z0.s, z1.h, z2.h[3]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMLALB
+;
+
+define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_0_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[0]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_1_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[1]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_2_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[2]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_3_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[3]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_4_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[4]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_5_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[5]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_6_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[6]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalb_lane_7_f32:
+; CHECK-NEXT:  bfmlalb z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMLALT
+;
+
+define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_0_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[0]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_1_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[1]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_2_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[2]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_3_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[3]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_4_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[4]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_5_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[5]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_6_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[6]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmlalt_lane_7_f32:
+; CHECK-NEXT:  bfmlalt z0.s, z1.h, z2.h[7]
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFMMLA
+;
+
+define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: bfmmla_f32:
+; CHECK-NEXT:  bfmmla z0.s, z1.h, z2.h
+; CHECK-NEXT:  ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
+  ret <vscale x 4 x float> %out
+}
+
+;
+; BFCVT
+;
+
+define <vscale x 8 x bfloat> @cvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
+; CHECK-LABEL: cvt_bf16_f32:
+; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+;
+; BFCVTNT
+;
+
+define <vscale x 8 x bfloat> @cvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
+; CHECK-LABEL: cvtnt_bf16_f32:
+; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7460,6 +7460,11 @@
   let Inst{20-16} = Zm;
 }
 
+multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_dot_indexed<string asm>
 : sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
   (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
@@ -7469,6 +7474,11 @@
   let Inst{18-16} = Zm;
 }
 
+multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot_indexed<asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul<string asm>
 : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
   asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
@@ -7486,6 +7496,11 @@
   let ElementSize = ElementSizeH;
 }
 
+multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul_longvecl<bit BT, string asm>
 : sve_bfloat_matmul<asm> {
   let Inst{23}    = 0b1;
@@ -7493,6 +7508,11 @@
   let Inst{10}    = BT;
 }
 
+multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
 : sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
   (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
@@ -7505,6 +7525,11 @@
   let Inst{10}    = BT;
 }
 
+multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+}
+
 class sve_bfloat_convert<bit N, string asm>
 : I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
   asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
@@ -7524,6 +7549,11 @@
   let ElementSize = ElementSizeS;
 }
 
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_convert<N, asm>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SVE Integer Matrix Multiply Group
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1239,15 +1239,15 @@
   defm FSQRT_ZPmZ  : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt",  int_aarch64_sve_fsqrt>;
 
   let Predicates = [HasBF16, HasSVE] in {
-    def BFDOT_ZZZ    : sve_bfloat_dot<"bfdot">;
-    def BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot">;
-    def BFMMLA_ZZZ  : sve_bfloat_matmul<"bfmmla">;
-    def BFMMLA_B_ZZZ  : sve_bfloat_matmul_longvecl<0b0, "bfmlalb">;
-    def BFMMLA_T_ZZZ  : sve_bfloat_matmul_longvecl<0b1, "bfmlalt">;
-    def BFMMLA_B_ZZI  : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb">;
-    def BFMMLA_T_ZZI  : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt">;
-    def BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt">;
-    def BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt">;
+    defm BFDOT_ZZZ    : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+    defm BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+    defm BFMMLA_ZZZ   : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
+    defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+    defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+    defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+    defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+    defm BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_cvt_bf16f32>;
+    defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_cvtnt_bf16f32>;
   }
 
   // InstAliases
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1340,6 +1340,21 @@
                 [LLVMMatchType<0>, LLVMSubdivide4VectorType<0>, LLVMSubdivide4VectorType<0>],
                 [IntrNoMem]>;
 
+class SVE_bfloat
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_nxv8bf16_ty,
+                 llvm_nxv8bf16_ty],
+                [IntrNoMem]>;
+
+class SVE_bfloat_index
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_nxv8bf16_ty,
+                 llvm_nxv8bf16_ty,
+                 llvm_i64_ty],
+                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
+
 //
 // Vector tuple creation intrinsics (ACLE)
 //
@@ -1793,6 +1808,9 @@
 def int_aarch64_sve_fcvtzs_i64f16   : Builtin_SVCVT<"svcvt_s64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
 def int_aarch64_sve_fcvtzs_i64f32   : Builtin_SVCVT<"svcvt_s64_f32_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv4f32_ty>;
 
+def int_aarch64_sve_cvt_bf16f32     : Builtin_SVCVT<"svcvt_bf16_f32_m",   llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+def int_aarch64_sve_cvtnt_bf16f32   : Builtin_SVCVT<"svcvtnt_bf16_f32_m", llvm_nxv8bf16_ty, llvm_nxv8i1_ty, llvm_nxv4f32_ty>;
+
 def int_aarch64_sve_fcvtzu_i32f16   : Builtin_SVCVT<"svcvt_u32_f16_m", llvm_nxv4i32_ty, llvm_nxv4i1_ty, llvm_nxv8f16_ty>;
 def int_aarch64_sve_fcvtzu_i32f64   : Builtin_SVCVT<"svcvt_u32_f64_m", llvm_nxv4i32_ty, llvm_nxv2i1_ty, llvm_nxv2f64_ty>;
 def int_aarch64_sve_fcvtzu_i64f16   : Builtin_SVCVT<"svcvt_u64_f16_m", llvm_nxv2i64_ty, llvm_nxv2i1_ty, llvm_nxv8f16_ty>;
@@ -2343,6 +2361,19 @@
 //
 def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
 
+//
+// SVE ACLE: 7.2. BFloat16 extensions
+//
+
+def int_aarch64_sve_bfdot   : SVE_bfloat;
+def int_aarch64_sve_bfmlalb : SVE_bfloat;
+def int_aarch64_sve_bfmlalt : SVE_bfloat;
+
+def int_aarch64_sve_bfmmla  : SVE_bfloat;
+
+def int_aarch64_sve_bfdot_lane   : SVE_bfloat_index;
+def int_aarch64_sve_bfmlalb_lane : SVE_bfloat_index;
+def int_aarch64_sve_bfmlalt_lane : SVE_bfloat_index;
 }
 
 //
Index: clang/utils/TableGen/SveEmitter.cpp
===================================================================
--- clang/utils/TableGen/SveEmitter.cpp
+++ clang/utils/TableGen/SveEmitter.cpp
@@ -215,13 +215,13 @@
   /// Return true if the intrinsic takes a splat operand.
   bool hasSplat() const {
     // These prototype modifiers are described in arm_sve.td.
-    return Proto.find_first_of("ajfrKLR@") != std::string::npos;
+    return Proto.find_first_of("ajfrKLR@~") != std::string::npos;
   }
 
   /// Return the parameter index of the splat operand.
   unsigned getSplatIdx() const {
     // These prototype modifiers are described in arm_sve.td.
-    auto Idx = Proto.find_first_of("ajfrKLR@");
+    auto Idx = Proto.find_first_of("ajfrKLR@~");
     assert(Idx != std::string::npos && Idx > 0 &&
            "Prototype has no splat operand");
     return Idx - 1;
@@ -556,6 +556,12 @@
     Bitwidth = ElementBitwidth;
     NumVectors = 0;
     break;
+  case '~':
+    ElementBitwidth = 16;
+    BFloat = true;
+    Float = false;
+    NumVectors = 0;
+    break;
   case 'R':
     ElementBitwidth /= 2;
     NumVectors = 0;
@@ -688,6 +694,12 @@
     Float = true;
     ElementBitwidth = 64;
     break;
+  case '$':
+    Predicate = false;
+    BFloat = true;
+    Float = false;
+    ElementBitwidth = 16;
+    break;
   case 'Q':
     Constant = true;
     Pointer = true;
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvtnt.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svcvtnt_bf16_f32_x(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvtnt_bf16_f32_x
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvtnt_bf16, _f32, _x, )(pg, op);
+}
+
+svbfloat16_t test_svcvtnt_bf16_f32_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvtnt_bf16_f32_m
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvtnt.bf16f32(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvtnt_bf16, _f32, _m, )(inactive, pg, op);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_cvt-bfloat.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svcvt_bf16_f32_x(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_x
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _x, )(pg, op);
+}
+
+svbfloat16_t test_svcvt_bf16_f32_z(svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_z
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _z, )(pg, op);
+}
+
+svbfloat16_t test_svcvt_bf16_f32_m(svbfloat16_t inactive, svbool_t pg, svfloat32_t op) {
+  // CHECK-LABEL: test_svcvt_bf16_f32_m
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.cvt.bf16f32(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], <vscale x 4 x float> %op)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svcvt_bf16, _f32, _m, )(inactive, pg, op);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmmla.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_bfmmla_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmmla_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmmla, _f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_svbfmlalt_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_svbfmlalt_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfmlalt_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfmlalt_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfmlalt_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfmlalt_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfmlalt_lane_4_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_4_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 4)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 4);
+}
+
+svfloat32_t test_bfmlalt_lane_5_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_5_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 5)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 5);
+}
+
+svfloat32_t test_bfmlalt_lane_6_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_6_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 6)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 6);
+}
+
+svfloat32_t test_bfmlalt_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_lane_7_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 7)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt_lane, _f32, , )(x, y, z, 7);
+}
+
+svfloat32_t test_bfmlalt_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalt_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalt, _n_f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16  -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_svbfmlalb_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_svbfmlalb_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfmlalb_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfmlalb_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfmlalb_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfmlalb_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfmlalb_lane_4_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_4_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 4)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 4);
+}
+
+svfloat32_t test_bfmlalb_lane_5_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_5_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 5)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 5);
+}
+
+svfloat32_t test_bfmlalb_lane_6_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_6_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 6)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 6);
+}
+
+svfloat32_t test_bfmlalb_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_lane_7_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 7)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb_lane, _f32, , )(x, y, z, 7);
+}
+
+svfloat32_t test_bfmlalb_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfmlalb_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfmlalb, _n_f32, , )(x, y, z);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+svfloat32_t test_bfdot_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot, _f32, , )(x, y, z);
+}
+
+svfloat32_t test_bfdot_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_0_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 0)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 0);
+}
+
+svfloat32_t test_bfdot_lane_1_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_1_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 1)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 1);
+}
+
+svfloat32_t test_bfdot_lane_2_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_2_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 2)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 2);
+}
+
+svfloat32_t test_bfdot_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_lane_3_f32(
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %z, i64 3)
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot_lane, _f32, , )(x, y, z, 3);
+}
+
+svfloat32_t test_bfdot_n_f32(svfloat32_t x, svbfloat16_t y, bfloat16_t z) {
+  // CHECK-LABEL: @test_bfdot_n_f32(
+  // CHECK: %[[SPLAT:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %z)
+  // CHECK: %[[RET:.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.nxv4f32(<vscale x 4 x float> %x, <vscale x 8 x bfloat> %y, <vscale x 8 x bfloat> %[[SPLAT]])
+  // CHECK: ret <vscale x 4 x float> %[[RET]]
+  return SVE_ACLE_FUNC(svbfdot, _n_f32, , )(x, y, z);
+}
Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -71,6 +71,7 @@
 // R: scalar of 1/2 width element type (splat to vector type)
 // r: scalar of 1/4 width element type (splat to vector type)
 // @: unsigned scalar of 1/4 width element type (splat to vector type)
+// ~: bfloat scalar (splat to vector type)
 // e: 1/2 width unsigned elements, 2x element count
 // b: 1/4 width unsigned elements, 4x element count
 // h: 1/2 width elements, 2x element count
@@ -96,6 +97,7 @@
 // O: svfloat16_t
 // M: svfloat32_t
 // N: svfloat64_t
+// $: svbfloat16_t
 
 // J: Prefetch type (sv_prfop)
 // A: pointer to int8_t
@@ -487,6 +489,20 @@
 let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
   def SVLD1RO_BF : SInst<"svld1ro[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1ro">;
 }
+
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVBFDOT   : SInst<"svbfdot[_{0}]",   "dd$$", "f", MergeNone, "aarch64_sve_bfdot">;
+  def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlalb">;
+  def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "dd$$", "f", MergeNone, "aarch64_sve_bfmlalt">;
+  def SVBFMMLA  : SInst<"svbfmmla[_{0}]",  "dd$$", "f", MergeNone, "aarch64_sve_bfmmla">;
+  def SVBFDOT_N   : SInst<"svbfdot[_n_{0}]",   "dd$~", "f", MergeNone, "aarch64_sve_bfdot">;
+  def SVBFMLAL_N  : SInst<"svbfmlalb[_n_{0}]", "dd$~", "f", MergeNone, "aarch64_sve_bfmlalb">;
+  def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "dd$~", "f", MergeNone, "aarch64_sve_bfmlalt">;
+  def SVBFDOT_LANE   : SInst<"svbfdot_lane[_{d}]",   "dd$$n", "f", MergeNone, "aarch64_sve_bfdot_lane",   [], [ImmCheck<3, ImmCheck0_3>]>;
+  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{d}]", "dd$$n", "f", MergeNone, "aarch64_sve_bfmlalb_lane", [], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{d}]", "dd$$n", "f", MergeNone, "aarch64_sve_bfmlalt_lane", [], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
 
@@ -1010,6 +1026,13 @@
 defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i",  "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
 defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l",  "aarch64_sve_fcvtzs_i64f32">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  // svcvt_bf16_f32
+  defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b",  "aarch64_sve_cvt_bf16f32">;
+  // svcvtnt_bf16_f32
+  defm SVCVTNT_BF16_F32 : SInstCvtMX<"svcvtnt_bf16[_f32]", "ddPM", "dPM", "b",  "aarch64_sve_cvtnt_bf16f32">;
+}
+
 // svcvt_s##_f64
 defm SVFCVTZS_S32_F64 : SInstCvtMXZ<"svcvt_s32[_f64]", "ttPd", "tPd", "d",  "aarch64_sve_fcvtzs_i32f64">;
 defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l",  "aarch64_sve_fcvtzs", [IsOverloadCvt]>;